In [None]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from funasr import AutoModel
import soundfile as sf
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from speechbrain.inference.VAD import VAD
import seaborn as sns
from pyannote.core import Segment
from pyannote.audio import Pipeline

In [2]:
# silero

SAMPLING_RATE = 16000
torch.set_num_threads(1)

torch.hub.set_dir('../models/.cache')
# NOTE: force_reload=True makes torch.hub fetch the repository again on every
# run, so executing this cell always requires network access.
model_silero, utils_silero = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils_silero

# pyannote
# The Hugging Face access token is taken from the HF_TOKEN environment variable
# instead of being embedded in the notebook. When the variable is unset, None is
# passed and the library falls back to any locally stored credentials.
# SECURITY: the token that used to be hard-coded here has been exposed and
# should be revoked.
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=os.environ.get("HF_TOKEN"),  # Only needed while downloading the model
    cache_dir="../models/.cache",
)

# speechbrain
vad = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
    savedir="../models/.cache",  # Save the model in a cache folder
)

# funasr
# Use the GPU when one is available and fall back to CPU otherwise, so the
# notebook also runs on machines without CUDA.
funasr_device = "cuda" if torch.cuda.is_available() else "cpu"
model_funasr = AutoModel(model="fsmn-vad", model_revision="v2.0.4", device=funasr_device)

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to ../models/.cache/master.zip
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.3.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.3.1. Bad things might happen unless you revert torch to 1.x.


2024-10-20 14:16:14,336 - modelscope - INFO - PyTorch version 2.3.1 Found.
2024-10-20 14:16:14,337 - modelscope - INFO - Loading ast index from /Users/saurabh/.cache/modelscope/ast_indexer
2024-10-20 14:16:14,449 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 270895fc7d76b5c7655183a5b4e2f1dd and a total number of 980 components indexed
2024-10-20 14:16:16,108 - modelscope - INFO - Use user-specified model revision: v2.0.4


In [3]:
import sys
# Make the project-local `helper` package importable.
# NOTE(review): this is a machine-specific absolute path; it must be adjusted
# when the notebook is run from another checkout or host.
sys.path.append("/home/sgeadmin/Saurabh_Kushwaha/vad/")

# Per-backend helpers: inference, timestamp conversion, noisy-audio runs and
# SNR plots for pyannote, FunASR, Silero and SpeechBrain respectively.
from helper import vad_inference_pyannote, print_timestamps_pyannote, run_vad_on_noisy_audio_pyannote, visualize_metrics_vs_SNR_pyannote
from helper import vad_inference_funasr, convert_to_timestamps_funasr, run_vad_on_noisy_audio_funasr, visualize_metrics_vs_SNR_funasr
from helper import vad_inference_silero, print_timestamps_silero, run_vad_on_noisy_audio_silero, visualize_metrics_vs_SNR_silero
from helper import vad_inference_speechbrain, print_timestamps_speechbrain, run_vad_on_noisy_audio_speechbrain, visualize_metrics_vs_SNR_speechbrain
# NOTE(review): this import rebinds `save_audio`, which the model-loading cell
# unpacked from `utils_silero`. `show_vad_metrics_matrix1` and
# `save_results_to_csv1` are also defined again in a later cell of this
# notebook; those later definitions replace the versions imported here.
from helper.vad import parse_annotations_file_bh, evaluate_vad, add_noise, save_audio, plot_SNR, extract_metrics, visualize_all_metrics, evaluate_vad_cmatrix, plot_confusion_matrices, get_file_paths, read_path, parse_annotations_file, average_metrics, show_vad_matrix_bh, save_results_to_csv, extract_speech_segments, count_continuous_zeros_after_start_segments, count_continuous_ones_after_end_segments, calculate_fec, calculate_msc, calculate_over, calculate_nds, save_results_to_csv1, show_vad_metrics_matrix1

In [4]:
def parse_speech_segments(file_path):
    """Read a segment file and return every segment that is not silence.

    Each non-blank line must contain exactly three whitespace-separated
    fields: ``label start_time end_time``. Lines whose label is ``!SIL`` are
    dropped; every other label is treated as speech. Blank lines (for example
    a trailing empty line) are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``{'speech': [start, end]}`` dicts in file order, with both
        times converted to float and rounded to 6 decimal places.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields, or
            if a start/end field cannot be converted to float.
    """
    speech_segments = []
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            if len(fields) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'label start end', got {line.strip()!r}"
                )
            label, start_time, end_time = fields
            if label == "!SIL":
                # Silence marker: not a speech segment.
                continue
            speech_segments.append({
                'speech': [round(float(start_time), 6), round(float(end_time), 6)]
            })
    return speech_segments
    
def show_vad_metrics_matrix1(metrics_fec, metrics_msc, metrics_over, metrics_nds, flag):
    """Plot a heatmap of the per-model mean of the FEC, MSC, OVER and NDS metrics.

    Args:
        metrics_fec: Mapping of model name to a sequence of FEC values.
        metrics_msc: Mapping of model name to a sequence of MSC values.
        metrics_over: Mapping of model name to a sequence of OVER values.
        metrics_nds: Mapping of model name to a sequence of NDS values.
        flag: When truthy, the averaged table is printed before plotting.

    Each mapping must contain the keys 'Pyannote', 'FunASR', 'Silero',
    'SpeechBrain' and 'ASRmodel'; a missing key raises KeyError.
    """
    model_order = ['Pyannote', 'FunASR', 'Silero', 'SpeechBrain', 'ASRmodel']
    values_by_metric = {
        'FEC': metrics_fec,
        'MSC': metrics_msc,
        'OVER': metrics_over,
        'NDS': metrics_nds,
    }

    # Mean of each model's values, grouped by metric.
    mean_by_metric = {}
    for metric_name, per_model_values in values_by_metric.items():
        mean_by_metric[metric_name] = {
            name: np.mean(per_model_values[name]) for name in model_order
        }

    # Transpose so that rows are metrics and columns are models.
    df_combined = pd.DataFrame(mean_by_metric).T

    if flag:
        print(df_combined)

    plt.figure(figsize=(12, 8))
    plt.title("VAD Metrics Comparison")
    sns.heatmap(df_combined, annot=True, cmap="YlGnBu", fmt=".3f")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.show()
    
def save_results_to_csv1(metrics_fec, metrics_msc, metrics_over, metrics_nds, model_names, output_file, label_paths):
    """Write the per-file, per-model FEC/MSC/OVER/NDS values to a CSV file.

    One row is written for every (file, model) pair, files in the outer loop
    and models in the inner loop.

    Args:
        metrics_fec: Mapping of model name to a list of FEC values, one per file.
        metrics_msc: Mapping of model name to a list of MSC values, one per file.
        metrics_over: Mapping of model name to a list of OVER values, one per file.
        metrics_nds: Mapping of model name to a list of NDS values, one per file.
        model_names: Model names to export; each must be a key of all four mappings.
        output_file: Destination path of the CSV file.
        label_paths: Label file paths, index-aligned with the metric lists. The
            'audio file' column is the file name up to its first dot.

    Raises:
        KeyError: If a model name is missing from one of the mappings.
        IndexError: If a metric list or ``label_paths`` is shorter than the
            FEC list of the first model.
    """
    columns = ['model', 'file index', 'audio file', 'FEC', 'MSC', 'OVER', 'NDS']

    # The file count is taken from the first model's FEC list; all other lists
    # are assumed to have the same length. No models means no rows.
    num_files = len(metrics_fec[model_names[0]]) if model_names else 0

    all_results = []
    for file_idx in range(num_files):
        # Take the base name first so that dots in directory names (for example
        # "../labels/x.txt") cannot truncate the path before the file name.
        file = os.path.basename(label_paths[file_idx]).split('.')[0]

        for model_name in model_names:
            all_results.append({
                'model': model_name,
                'file index': file_idx,
                'audio file': file,
                'FEC': metrics_fec[model_name][file_idx],
                'MSC': metrics_msc[model_name][file_idx],
                'OVER': metrics_over[model_name][file_idx],
                'NDS': metrics_nds[model_name][file_idx],
            })

    # Passing the columns explicitly keeps the header even when there are no rows.
    df = pd.DataFrame(all_results, columns=columns)
    df.to_csv(output_file, index=False)

In [5]:
wav_directory = "/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_audio"
label_directory = "/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_label"
transcript_directory = "/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_trans"

# Number of files (taken from the start of the sorted listings) evaluated in this run.
NUM_EVAL_FILES = 20

_, label_paths = read_path(wav_directory, label_directory)
audio_paths, forced_paths = read_path(wav_directory, transcript_directory)

# Sort each listing and keep only the evaluation subset.
# NOTE(review): audio, label and transcript files are paired purely by their
# position after sorting - confirm that the three directories use matching
# file names so the orders agree.
audio_paths = sorted(audio_paths)[:NUM_EVAL_FILES]
label_paths = sorted(label_paths)[:NUM_EVAL_FILES]
forced_paths = sorted(forced_paths)[:NUM_EVAL_FILES]

# Parse only the label files that are actually evaluated, rather than parsing
# the whole directory and discarding most of the results.
annotated_segments = [parse_annotations_file_bh(label_path) for label_path in label_paths]

In [6]:
# One independent, initially empty list per model; the inference loop appends
# one evaluate_vad_cmatrix result per evaluated file.
(cmatrix_pyannote,
 cmatrix_silero,
 cmatrix_speechbrain,
 cmatrix_funasr,
 cmatrix_ASRmodel) = ([] for _ in range(5))

# Indices of the files that the inference loop skips.
rr = list()

In [None]:
# Start from empty accumulators every time this cell runs. Without this,
# re-running the cell would append a second copy of every result (and of every
# skipped index) to lists created in an earlier cell.
cmatrix_pyannote = []
cmatrix_silero = []
cmatrix_speechbrain = []
cmatrix_funasr = []
cmatrix_ASRmodel = []
rr = []

for i, audio_path in enumerate(audio_paths):
    # Raw model outputs for this file.
    pyannote = vad_inference_pyannote(pipeline, audio_path)
    funasr = vad_inference_funasr(audio_path, model_funasr)
    silero = vad_inference_silero(audio_path, model_silero, utils_silero, sampling_rate=SAMPLING_RATE)
    speechbrain = vad_inference_speechbrain(audio_path, vad)

    # Convert each output to segments; the forced-alignment file supplies the
    # "ASRmodel" segments.
    pyannote = print_timestamps_pyannote(pyannote)
    funasr = convert_to_timestamps_funasr(funasr)
    silero = print_timestamps_silero(silero)
    speechbrain = print_timestamps_speechbrain(speechbrain)
    ASRmodel = parse_speech_segments(forced_paths[i])

    # If any source produced no segments, record the index and skip the file so
    # that every model is scored on exactly the same set of files.
    if any(segments == [] for segments in (pyannote, funasr, silero, speechbrain, ASRmodel)):
        rr.append(i)
        continue

    reference = annotated_segments[i]
    cmatrix_pyannote.append(evaluate_vad_cmatrix(pyannote, reference))
    cmatrix_silero.append(evaluate_vad_cmatrix(silero, reference))
    cmatrix_speechbrain.append(evaluate_vad_cmatrix(speechbrain, reference))
    cmatrix_funasr.append(evaluate_vad_cmatrix(funasr, reference))
    cmatrix_ASRmodel.append(evaluate_vad_cmatrix(ASRmodel, reference))

In [8]:
# Remove the files skipped during inference so that the path and annotation
# lists line up index-for-index with the cmatrix_* lists.
skipped_indices = set(rr)

# Slice assignment updates the existing list objects in place. forced_paths is
# filtered as well so that all four parallel lists stay aligned.
audio_paths[:] = [path for idx, path in enumerate(audio_paths) if idx not in skipped_indices]
label_paths[:] = [path for idx, path in enumerate(label_paths) if idx not in skipped_indices]
annotated_segments[:] = [seg for idx, seg in enumerate(annotated_segments) if idx not in skipped_indices]
forced_paths[:] = [path for idx, path in enumerate(forced_paths) if idx not in skipped_indices]

# The recorded indices are now consumed; clearing them makes a second run of
# this cell a no-op instead of removing further (unrelated) files.
rr.clear()

In [9]:
# Initialize additional metrics containers
metrics_fec = {"Pyannote": [], "FunASR": [], "Silero": [], "SpeechBrain": [], "ASRmodel": []}
metrics_msc = {"Pyannote": [], "FunASR": [], "Silero": [], "SpeechBrain": [], "ASRmodel": []}
metrics_over = {"Pyannote": [], "FunASR": [], "Silero": [], "SpeechBrain": [], "ASRmodel": []}
metrics_nds = {"Pyannote": [], "FunASR": [], "Silero": [], "SpeechBrain": [], "ASRmodel": []}

# Evaluate each model on each audio file
for i in range(len(audio_paths)):
    # Compute additional metrics for Pyannote
    metrics_fec["Pyannote"].append(calculate_fec(cmatrix_pyannote[i][1], cmatrix_pyannote[i][0]))
    metrics_msc["Pyannote"].append(calculate_msc(cmatrix_pyannote[i][1], cmatrix_pyannote[i][0]))
    metrics_over["Pyannote"].append(calculate_over(cmatrix_pyannote[i][1], cmatrix_pyannote[i][0]))
    metrics_nds["Pyannote"].append(calculate_nds(cmatrix_pyannote[i][1], cmatrix_pyannote[i][0]))

    # Compute additional metrics for FunASR
    metrics_fec["FunASR"].append(calculate_fec(cmatrix_funasr[i][1], cmatrix_funasr[i][0]))
    metrics_msc["FunASR"].append(calculate_msc(cmatrix_funasr[i][1], cmatrix_funasr[i][0]))
    metrics_over["FunASR"].append(calculate_over(cmatrix_funasr[i][1], cmatrix_funasr[i][0]))
    metrics_nds["FunASR"].append(calculate_nds(cmatrix_funasr[i][1], cmatrix_funasr[i][0]))

    # Compute additional metrics for Silero
    metrics_fec["Silero"].append(calculate_fec(cmatrix_silero[i][1], cmatrix_silero[i][0]))
    metrics_msc["Silero"].append(calculate_msc(cmatrix_silero[i][1], cmatrix_silero[i][0]))
    metrics_over["Silero"].append(calculate_over(cmatrix_silero[i][1], cmatrix_silero[i][0]))
    metrics_nds["Silero"].append(calculate_nds(cmatrix_silero[i][1], cmatrix_silero[i][0]))

    # Compute additional metrics for SpeechBrain
    metrics_fec["SpeechBrain"].append(calculate_fec(cmatrix_speechbrain[i][1], cmatrix_speechbrain[i][0]))
    metrics_msc["SpeechBrain"].append(calculate_msc(cmatrix_speechbrain[i][1], cmatrix_speechbrain[i][0]))
    metrics_over["SpeechBrain"].append(calculate_over(cmatrix_speechbrain[i][1], cmatrix_speechbrain[i][0]))
    metrics_nds["SpeechBrain"].append(calculate_nds(cmatrix_speechbrain[i][1], cmatrix_speechbrain[i][0]))

    # Compute additional metrics for ASRmodel
    metrics_fec["ASRmodel"].append(calculate_fec(cmatrix_ASRmodel[i][1], cmatrix_ASRmodel[i][0]))
    metrics_msc["ASRmodel"].append(calculate_msc(cmatrix_ASRmodel[i][1], cmatrix_ASRmodel[i][0]))
    metrics_over["ASRmodel"].append(calculate_over(cmatrix_ASRmodel[i][1], cmatrix_ASRmodel[i][0]))
    metrics_nds["ASRmodel"].append(calculate_nds(cmatrix_ASRmodel[i][1], cmatrix_ASRmodel[i][0]))

In [None]:
# Heatmap of the per-model mean FEC/MSC/OVER/NDS; flag=True also prints the table.
show_vad_metrics_matrix1(
    metrics_fec,
    metrics_msc,
    metrics_over,
    metrics_nds,
    flag=True,
)

In [11]:
# Export one CSV row per (file, model) pair with its FEC/MSC/OVER/NDS values.
model_names = ['Pyannote', 'FunASR', 'Silero', 'SpeechBrain', 'ASRmodel']
save_results_to_csv1(
    metrics_fec=metrics_fec,
    metrics_msc=metrics_msc,
    metrics_over=metrics_over,
    metrics_nds=metrics_nds,
    model_names=model_names,
    output_file='vad-new-vani-FA.csv',
    label_paths=label_paths,
)