In [7]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
import json
import wave
from funasr import AutoModel
import soundfile as sf
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from speechbrain.inference.VAD import VAD
import seaborn as sns
from pyannote.core import Segment
from pyannote.audio import Pipeline

In [None]:
# Load the four pretrained VAD systems compared in this notebook.
# All downloads are cached under ../models/.cache.

# --- Silero ---
SAMPLING_RATE = 16000
torch.set_num_threads(1)

torch.hub.set_dir('../models/.cache')
# force_reload=True makes torch.hub fetch the repository again on every run.
model_silero, utils_silero = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils_silero

# --- pyannote ---
# SECURITY: the Hugging Face access token must never be committed to the
# notebook. It is read from the HF_TOKEN environment variable instead; when the
# variable is unset, None is passed and the library falls back to whatever
# credentials are already configured locally. The token that used to be
# hardcoded here should be treated as leaked and revoked.
pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token=os.environ.get("HF_TOKEN"),  # only needed while downloading the model
        cache_dir="../models/.cache"
        )

# --- SpeechBrain ---
vad = VAD.from_hparams(
        source="speechbrain/vad-crdnn-libriparty",
        savedir="../models/.cache"  # Save the model in a cache folder
)

# --- FunASR ---
# Use the GPU when one is available, otherwise fall back to CPU so the
# notebook still runs on machines without CUDA.
model_funasr = AutoModel(
    model="fsmn-vad",
    model_revision="v2.0.4",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [9]:
import sys
sys.path.append("/home/sgeadmin/Saurabh_Kushwaha/vad/")

from helper import vad_inference_pyannote, print_timestamps_pyannote, run_vad_on_noisy_audio_pyannote, visualize_metrics_vs_SNR_pyannote
from helper import vad_inference_funasr, convert_to_timestamps_funasr, run_vad_on_noisy_audio_funasr, visualize_metrics_vs_SNR_funasr
from helper import vad_inference_silero, print_timestamps_silero, run_vad_on_noisy_audio_silero, visualize_metrics_vs_SNR_silero
from helper import vad_inference_speechbrain, print_timestamps_speechbrain, run_vad_on_noisy_audio_speechbrain, visualize_metrics_vs_SNR_speechbrain
from helper.vad import parse_annotations_file_bh, evaluate_vad, add_noise, save_audio, plot_SNR, extract_metrics, visualize_all_metrics, evaluate_vad_cmatrix, plot_confusion_matrices, get_file_paths, read_path, parse_annotations_file, average_metrics, show_vad_matrix_bh, save_results_to_csv, extract_speech_segments, count_continuous_zeros_after_start_segments, count_continuous_ones_after_end_segments, calculate_fec, calculate_msc, calculate_over, calculate_nds, save_results_to_csv1, show_vad_metrics_matrix1

In [10]:
def get_filename(file_path):
    """Return the identifier of a file: its base name up to the first dot.

    The directory part is everything before the last '/'; the identifier is
    the remaining text truncated at its first '.', so 'a/b/clip.v2.wav'
    yields 'clip'.
    """
    _, _, base_name = file_path.rpartition('/')
    identifier, _, _ = base_name.partition('.')
    return identifier


def generate_speech_segments_from_nonspeech_segments(nonspeech_segments, total_duration, margin=0.001):
    """Build the speech segments that are the complement of non-speech intervals.

    Args:
        nonspeech_segments: iterable of dicts of the form
            ``{'nonspeech': [start, end]}``.
        total_duration: end of the timeline; the final speech segment runs up
            to this value.
        margin: gap left between a speech segment and the neighbouring
            non-speech interval on either side.

    Returns:
        A list of ``{'speech': [start, end]}`` dicts, values rounded to 6
        decimals, in ascending time order.

    Intervals are processed in order of their start time, so unsorted input is
    handled. Gaps that are not larger than ``margin`` produce no segment
    (previously they produced zero- or negative-length segments), and an
    interval nested inside an earlier one can no longer move the cursor
    backwards.
    """
    speech_segments = []
    cursor = 0.0

    ordered = sorted(nonspeech_segments, key=lambda segment: segment['nonspeech'][0])

    for segment in ordered:
        nonspeech_start, nonspeech_end = segment['nonspeech']

        # Only emit a segment that still has positive length once the margin
        # has been removed from its end.
        speech_end = nonspeech_start - margin
        if speech_end > cursor:
            speech_segments.append({'speech': [round(cursor, 6), round(speech_end, 6)]})

        # Never step backwards: an overlapping/nested interval may end earlier
        # than one that has already been consumed.
        cursor = max(cursor, nonspeech_end + margin)

    if cursor < total_duration:
        speech_segments.append({'speech': [round(cursor, 6), round(total_duration, 6)]})

    return speech_segments

def get_wav_duration(file_path):
    """Return the length of a WAV file in seconds (frame count / frame rate)."""
    with wave.open(file_path, 'rb') as reader:
        return reader.getnframes() / float(reader.getframerate())

def extract_speech_segments_from_json(file_path, key, wav_dir="/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_audio/"):
    """Derive speech segments for one recording from a JSON file of silences.

    The JSON file maps recording ids to objects whose ``"sil"`` entry holds the
    non-speech intervals as a list of ``[start, end]`` pairs, stored as a
    Python-literal string. The speech segments are the complement of those
    intervals over the duration of ``<wav_dir>/<key>.wav``.

    Args:
        file_path: path of the JSON file.
        key: recording id to look up; also the stem of the WAV file.
        wav_dir: directory containing the WAV files. Defaults to the dataset
            location this notebook was written against.

    Returns:
        A list of ``{'speech': [start, end]}`` dicts, or ``[]`` when ``key`` is
        not present in the JSON file.

    Raises:
        ValueError / SyntaxError: if the ``"sil"`` string is not a plain
            Python literal.
    """
    import ast  # local import so this cell does not depend on the imports cell being edited

    with open(file_path, 'r') as file:
        data = json.load(file)

    if key not in data:
        return []

    sil_value = data[key]["sil"]
    # SECURITY: this used to be eval(sil_value), which executes arbitrary code
    # found in the JSON file. ast.literal_eval only accepts literals (lists,
    # numbers, strings, ...). A value that is already a list is used as is.
    parsed_sil = ast.literal_eval(sil_value) if isinstance(sil_value, str) else sil_value
    nonspeech_segments = [{'nonspeech': [float(num) for num in pair]} for pair in parsed_sil]

    wav_file = os.path.join(wav_dir, key + ".wav")
    total_duration = get_wav_duration(wav_file)

    return generate_speech_segments_from_nonspeech_segments(nonspeech_segments, total_duration)

def parse_speech_segments(file_path):
    """Read speech segments from a whitespace-separated ``label start end`` file.

    Args:
        file_path: path of a text file with one segment per line, each line
            holding three whitespace-separated fields. The label is ignored.

    Returns:
        A list of ``{'speech': [start, end]}`` dicts in file order, with both
        times converted to float and rounded to 6 decimals.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            a time field is not a number.
    """
    speech_segments = []
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()

            # Blank lines (typically a trailing newline at the end of the
            # file) used to crash the tuple unpacking below; skip them.
            if not fields:
                continue

            if len(fields) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'label start end', got {line.strip()!r}"
                )

            _label, start_time, end_time = fields
            speech_segments.append({
                'speech': [round(float(start_time), 6), round(float(end_time), 6)]
            })
    return speech_segments


def show_vad_matrix_bh(avg_pyannote, avg_funasr, avg_silero, avg_speechbrain, ASRmodel, newmodel, flag):
    """Plot a heatmap of mean metric values, one column per model.

    Each of the six model arguments is iterated, and every item obtained is
    indexed by the metric names listed below, i.e. each argument is expected
    to be an iterable of mappings that contain those keys. For every
    (metric, model) pair the collected values are averaged with ``np.mean``.

    Args:
        avg_pyannote, avg_funasr, avg_silero, avg_speechbrain, ASRmodel,
        newmodel: per-model metric records (see above).
        flag: when truthy, the averaged table is also printed.
    """
    models = ['Pyannote', 'FunASR', 'Silero', 'SpeechBrain', 'ASRmodel', 'newmodel']
    metrics = ['precision', 'recall', 'f1_score', 'accuracy', 'specificity', 'fdr', 'miss_rate']
    per_model_records = [avg_pyannote, avg_funasr, avg_silero, avg_speechbrain, ASRmodel, newmodel]

    # metric -> model -> list of raw values
    collected = {}
    for metric in metrics:
        collected[metric] = {name: [] for name in models}

    for name, records in zip(models, per_model_records):
        for record in records:
            for metric in metrics:
                collected[metric][name].append(record[metric])

    # metric -> model -> mean value
    average_data = {}
    for metric in metrics:
        average_data[metric] = {name: np.mean(collected[metric][name]) for name in models}

    # Rows are metrics, columns are models.
    df_combined = pd.DataFrame(average_data).T

    if flag:
        print(df_combined)

    plt.figure(figsize=(12, 8))
    plt.title("Model Metrics Comparison")
    sns.heatmap(df_combined, annot=True, cmap="YlGnBu", fmt=".3f")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.show()

def save_results_to_csv(results, model_names, output_file, label_paths):
    """Write per-file, per-model evaluation results to a CSV file.

    Args:
        results: one list per model (same order as ``model_names``); each
            inner list holds one metrics dict per file.
        model_names: model names, used for the 'model' column.
        output_file: destination CSV path.
        label_paths: label file paths, indexed in parallel with the inner
            result lists; only used to name the 'audio file' column.

    The CSV has one row per (file, model) with the columns 'model',
    'file index', 'audio file' followed by the keys of the metrics dict.
    """
    all_results = []

    # The number of files is taken from the first model's results; an empty
    # ``results`` produces an empty CSV instead of an IndexError.
    num_files = len(results[0]) if results else 0

    for file_idx in range(num_files):
        # Take the base name first, then cut the extension. The previous
        # order (cut at the first '.', then take the base name) returned a
        # wrong or empty name for paths such as '../labels/a.txt'.
        file = label_paths[file_idx].split('/')[-1].split('.')[0]

        for model_idx, model_name in enumerate(model_names):
            row = {'model': model_name, 'file index': file_idx, 'audio file': file}
            row.update(results[model_idx][file_idx])
            all_results.append(row)

    df = pd.DataFrame(all_results)
    df.to_csv(output_file, index=False)


def merge_speech_segments(speech_segments):
    """Collapse a list of speech segments into a single spanning segment.

    The result runs from the start of the first segment to the end of the
    last segment in the list; everything in between is treated as speech.
    An empty (or falsy) input yields an empty list.
    """
    if speech_segments:
        first_segment = speech_segments[0]
        last_segment = speech_segments[-1]
        span = [first_segment['speech'][0], last_segment['speech'][1]]
        return [{'speech': span}]

    return []

In [14]:
# Dataset locations.
wav_directory = "/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_audio"
transcript_directory = "/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_trans"
label_directory = "/home/sgeadmin/Saurabh_Kushwaha/vad/data/bh_dataset/188_samples/188_label_for_FE"
file_path = "/home/sgeadmin/Saurabh_Kushwaha/vad/evaluation/vad_inhouse/vad_bh/output.json"

# Only the first MAX_FILES files (in sorted order) are evaluated.
MAX_FILES = 20

# read_path is called once per companion directory: the first call provides
# the label paths, the second the audio paths and the transcript paths.
_, label_paths = read_path(wav_directory, label_directory)
audio_paths, forced_paths = read_path(wav_directory, transcript_directory)

# The three lists are indexed in parallel further down, so all of them must be
# put in the same (sorted) order. `forced_paths.sort` used to be missing its
# call parentheses and therefore never sorted the transcript paths.
audio_paths.sort()
label_paths.sort()
forced_paths.sort()

audio_paths = audio_paths[:MAX_FILES]
label_paths = label_paths[:MAX_FILES]
forced_paths = forced_paths[:MAX_FILES]

# Parse only the annotation files that are actually evaluated.
annotated_segments = [parse_annotations_file_bh(label_path) for label_path in label_paths]

In [15]:
# Per-file evaluation results, one independent list per VAD system.
(result_pyannote,
 result_silero,
 result_speechbrain,
 result_funasr,
 result_ASRmodel,
 result_newmodel) = ([] for _ in range(6))

# File indices consumed by the clean-up cell further down, which pops them
# from the path and annotation lists.
rr = list()

In [None]:
# Run every VAD system on every audio file and score it against the manual
# annotation. A file is only scored when *all* six systems produced at least
# one segment; otherwise its index is recorded in `rr` so that the clean-up
# cell can drop it from the path/annotation lists and keep them aligned with
# the result lists. (Previously nothing was ever added to `rr`, so skipped
# files silently shifted the results relative to `label_paths`.)
#
# This cell appends to the result lists and to `rr`; re-run the cell that
# initialises them before running this one again.
for i, audio_path in enumerate(audio_paths):
    # Raw model outputs.
    pyannote = vad_inference_pyannote(pipeline, audio_path)
    funasr = vad_inference_funasr(audio_path, model_funasr)
    silero = vad_inference_silero(audio_path, model_silero, utils_silero, sampling_rate=SAMPLING_RATE)
    speechbrain = vad_inference_speechbrain(audio_path, vad)

    # Convert each output to a list of {'speech': [start, end]} segments.
    pyannote = print_timestamps_pyannote(pyannote)
    funasr = convert_to_timestamps_funasr(funasr)
    silero = print_timestamps_silero(silero)
    speechbrain = print_timestamps_speechbrain(speechbrain)
    ASRmodel = parse_speech_segments(forced_paths[i])
    newmodel = extract_speech_segments_from_json(file_path, get_filename(audio_path))

    # Collapse each system's output to one span (first start .. last end).
    funasr = merge_speech_segments(funasr)
    pyannote = merge_speech_segments(pyannote)
    silero = merge_speech_segments(silero)
    speechbrain = merge_speech_segments(speechbrain)
    ASRmodel = merge_speech_segments(ASRmodel)
    newmodel = merge_speech_segments(newmodel)

    candidates = (pyannote, funasr, silero, speechbrain, ASRmodel, newmodel)
    if any(segments == [] for segments in candidates):
        # At least one system found no speech: remember the file for removal.
        rr.append(i)
        continue

    reference = annotated_segments[i]
    result_pyannote.append(evaluate_vad(pyannote, reference))
    result_silero.append(evaluate_vad(silero, reference))
    result_speechbrain.append(evaluate_vad(speechbrain, reference))
    result_funasr.append(evaluate_vad(funasr, reference))

    result_ASRmodel.append(evaluate_vad(ASRmodel, reference))
    result_newmodel.append(evaluate_vad(newmodel, reference))

In [17]:
# Remove the files listed in `rr` from every per-file list so that the lists
# stay index-aligned with the result lists.
# - Highest index first, so an earlier pop does not shift a later one.
# - set(): a duplicated index must not remove two different files.
# - forced_paths is a parallel list as well and is now kept in sync.
for index in sorted(set(rr), reverse=True):
    audio_paths.pop(index)
    label_paths.pop(index)
    forced_paths.pop(index)
    annotated_segments.pop(index)

# The indices refer to the lists as they were before the removal; clear them
# so that re-running this cell does not remove further files.
rr.clear()

In [18]:
# Aggregate the per-file results of each system with average_metrics
# (calls are made in the order listed).
(avg_funasr,
 avg_pyannote,
 avg_speechbrain,
 avg_silero,
 avg_ASRmodel,
 avg_newmodel) = (
    average_metrics(per_file_results)
    for per_file_results in (
        result_funasr,
        result_pyannote,
        result_speechbrain,
        result_silero,
        result_ASRmodel,
        result_newmodel,
    )
)

In [None]:
# Print the averaged metric table and draw it as a heatmap.
show_vad_matrix_bh(
    avg_pyannote,
    avg_funasr,
    avg_silero,
    avg_speechbrain,
    avg_ASRmodel,
    avg_newmodel,
    flag=True,
)

In [None]:
# Export the per-file results of each system to CSV.
# NOTE(review): result_newmodel is evaluated above but not exported here -
# confirm whether that is intentional.
output_file = "vad-matrix-FA.csv"

results_by_model = {
    'Pyannote': result_pyannote,
    'FunASR': result_funasr,
    'Silero': result_silero,
    'SpeechBrain': result_speechbrain,
    'ASRmodel': result_ASRmodel,
}
model_names = list(results_by_model)

# Save CSV file
save_results_to_csv(list(results_by_model.values()), model_names, output_file, label_paths)