In [1]:
import os
import torch
import torchaudio
import pandas as pd
import numpy as np
from funasr import AutoModel
import soundfile as sf
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from speechbrain.inference.VAD import VAD
import seaborn as sns
from pyannote.core import Segment
from pyannote.audio import Pipeline

Notice: ffmpeg is not installed. torchaudio is used to load audio
If you want to use ffmpeg backend to load audio, please install it by:
	sudo apt install ffmpeg # ubuntu
	# brew install ffmpeg # mac


In [2]:
# Load the four VAD back-ends compared in this notebook. All checkpoints are
# cached under ../models/.cache.

# silero

SAMPLING_RATE = 16000
torch.set_num_threads(1)

torch.hub.set_dir('../models/.cache')
# force_reload=True asks torch.hub for a fresh download of the repo on each run.
model_silero, utils_silero = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)

(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils_silero

# pyannote
# The Hugging Face access token must never be committed with the notebook.
# It is read from the HF_TOKEN environment variable instead; when the variable
# is unset, None is passed and pyannote uses its default authentication.
pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token=os.environ.get("HF_TOKEN"),  # Only needed while downloading the model
        cache_dir="../models/.cache"
        )

# speechbrain
vad = VAD.from_hparams(
        source="speechbrain/vad-crdnn-libriparty",
        savedir="../models/.cache"  # Save the model in a cache folder
)

# funasr
model_funasr = AutoModel(model="fsmn-vad", model_revision="v2.0.4", device="cuda")

Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to ../models/.cache/master.zip
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.3.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.3.1. Bad things might happen unless you revert torch to 1.x.


2024-10-21 15:47:25,205 - modelscope - INFO - PyTorch version 2.3.1 Found.
2024-10-21 15:47:25,207 - modelscope - INFO - Loading ast index from /Users/saurabh/.cache/modelscope/ast_indexer
2024-10-21 15:47:25,326 - modelscope - INFO - Loading done! Current index file version is 1.15.0, with md5 270895fc7d76b5c7655183a5b4e2f1dd and a total number of 980 components indexed
2024-10-21 15:47:27,375 - modelscope - INFO - Use user-specified model revision: v2.0.4


In [3]:
import sys
sys.path.append("/Users/saurabh/Documents/projects/Voice-Activity-Detection")

from helper import vad_inference_pyannote, print_timestamps_pyannote, run_vad_on_noisy_audio_pyannote, visualize_metrics_vs_SNR_pyannote
from helper import vad_inference_funasr, convert_to_timestamps_funasr, run_vad_on_noisy_audio_funasr, visualize_metrics_vs_SNR_funasr
from helper import vad_inference_silero, print_timestamps_silero, run_vad_on_noisy_audio_silero, visualize_metrics_vs_SNR_silero
from helper import vad_inference_speechbrain, print_timestamps_speechbrain, run_vad_on_noisy_audio_speechbrain, visualize_metrics_vs_SNR_speechbrain
from helper.vad import parse_annotations_file_bh, evaluate_vad, add_noise, save_audio, plot_SNR, extract_metrics, visualize_all_metrics, evaluate_vad_cmatrix, plot_confusion_matrices, get_file_paths, read_path, parse_annotations_file, average_metrics, show_vad_matrix_bh, save_results_to_csv, extract_speech_segments, count_continuous_zeros_after_start_segments, count_continuous_ones_after_end_segments, calculate_fec, calculate_msc, calculate_over, calculate_nds, save_results_to_csv1, show_vad_metrics_matrix1, save_multiple_speech_segments_as_text, extract_filenames_as_string

In [4]:
def get_filename(file_path):
    """Return the part of the last '/'-separated path component before its first dot."""
    last_component = file_path.rsplit('/', 1)[-1]
    stem, _, _ = last_component.partition('.')
    return stem

def parse_speech_segments(file_path):
    """Parse a whitespace-separated ``label start end`` file into speech spans.

    Every non-blank line must hold exactly three fields. Lines whose label is
    ``!SIL`` are ignored; every other label is treated as speech.

    Args:
        file_path (str): Path to the annotation file.

    Returns:
        list: ``{'speech': [start, end]}`` dicts in file order, with both times
        converted to float and rounded to 6 decimals.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            a time field is not a valid float.
    """
    speech_segments = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) lines, e.g. a trailing newline,
                # carry no annotation and must not break the unpacking below.
                continue

            label, start_time, end_time = fields
            if label != "!SIL":  # Everything except the silence label counts as speech
                speech_segments.append({
                    'speech': [round(float(start_time), 6), round(float(end_time), 6)]
                })
    return speech_segments

def generate_speech_segments_from_nonspeech_segments(nonspeech_segments, total_duration, margin=0.001):
    """Derive speech spans as the complement of the given non-speech spans.

    Walks the non-speech spans in the order given, emitting the gap before each
    one as speech. Each speech span is shrunk by ``margin`` on the side that
    touches a non-speech span.

    Args:
        nonspeech_segments (list): ``{'nonspeech': [start, end]}`` dicts,
            expected in ascending time order.
        total_duration (float): Length of the recording; closes the last span.
        margin (float): Gap kept between a speech span and a neighbouring
            non-speech span.

    Returns:
        list: ``{'speech': [start, end]}`` dicts with times rounded to 6 decimals.
    """
    speech_segments = []
    current_time = 0.0

    for nonspeech in nonspeech_segments:
        nonspeech_start, nonspeech_end = nonspeech['nonspeech']

        speech_end = nonspeech_start - margin
        # Only emit a span with positive length: when two silences are closer
        # together than the margin, the old code produced an inverted span
        # (end before start).
        if speech_end > current_time:
            speech_segments.append({'speech': [round(current_time, 6), round(speech_end, 6)]})

        # Never move the cursor backwards; a silence nested inside or
        # overlapping an earlier one must not re-open time already covered.
        current_time = max(current_time, nonspeech_end + margin)

    if current_time < total_duration:
        speech_segments.append({'speech': [round(current_time, 6), round(total_duration, 6)]})

    return speech_segments

def get_wav_duration(file_path):
    """Return the duration of a WAV file in seconds.

    Args:
        file_path (str): Path to a WAV file readable by the stdlib ``wave`` module.

    Returns:
        float: Number of frames divided by the frame rate.
    """
    # `wave` is not imported anywhere else in the notebook, so import it here
    # to keep the function usable on a fresh kernel.
    import wave

    with wave.open(file_path, 'rb') as wav_file:
        n_frames = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
        return n_frames / float(frame_rate)

def extract_speech_segments_from_json(
    file_path,
    key,
    audio_dir="/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/bh_dataset/bh_audios/",
):
    """Build speech spans for one recording from the silence list stored in a JSON file.

    The JSON file maps recording keys to objects whose ``"sil"`` entry holds a
    list of ``[start, end]`` silence spans (stored as a string literal). The
    speech spans are the complement of those silences over the duration of
    ``<audio_dir>/<key>.wav``.

    Args:
        file_path (str): Path to the JSON file.
        key (str): Recording identifier; also the WAV file stem.
        audio_dir (str): Directory containing ``<key>.wav``. Defaults to the
            location that was previously hardcoded.

    Returns:
        list: ``{'speech': [start, end]}`` dicts, or ``[]`` when ``key`` is not
        present in the JSON file.
    """
    # Neither module is imported elsewhere in the notebook.
    import ast
    import json
    import os

    with open(file_path, 'r') as file:
        data = json.load(file)

    if key not in data:
        return []

    sil_value = data[key]["sil"]
    # SECURITY: this value comes from a file on disk, so it is parsed with
    # ast.literal_eval (literals only) rather than eval, which would execute
    # arbitrary expressions. A value that is already a list is used as is.
    parsed_sil = ast.literal_eval(sil_value) if isinstance(sil_value, str) else sil_value
    nonspeech_segments = [{'nonspeech': [float(num) for num in sublist]} for sublist in parsed_sil]

    wav_file = os.path.join(audio_dir, key + ".wav")
    total_duration = get_wav_duration(wav_file)

    return generate_speech_segments_from_nonspeech_segments(nonspeech_segments, total_duration)

def show_vad_metrics_matrix1(metrics_fec, metrics_msc, metrics_over, metrics_nds, flag):
    """Plot a heatmap of the per-model mean of the FEC, MSC, OVER and NDS metrics.

    Each ``metrics_*`` argument is indexed by model name and must contain an
    entry for every model listed below; the entry is averaged with ``np.mean``.
    When ``flag`` is truthy the averaged table is also printed.
    """
    models = ['Pyannote', 'FunASR', 'Silero', 'SpeechBrain', 'ASRmodel', 'newmodel']
    values_by_metric = {
        'FEC': metrics_fec,
        'MSC': metrics_msc,
        'OVER': metrics_over,
        'NDS': metrics_nds,
    }

    # metric -> model -> mean value
    average_data = {}
    for metric, per_model in values_by_metric.items():
        average_data[metric] = {model: np.mean(per_model[model]) for model in models}

    # Transpose so that metrics are rows and models are columns.
    df_combined = pd.DataFrame(average_data).T

    if flag:
        print(df_combined)

    plt.figure(figsize=(12, 8))
    plt.title("VAD Metrics Comparison")
    sns.heatmap(df_combined, annot=True, cmap="YlGnBu", fmt=".3f")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.show()
    
def save_results_to_csv1(metrics_fec, metrics_msc, metrics_over, metrics_nds, model_names, output_file, label_paths):
    """Write one CSV row per (file, model) pair with its FEC, MSC, OVER and NDS values.

    Args:
        metrics_fec, metrics_msc, metrics_over, metrics_nds (dict): Map a model
            name to a list of per-file values, all aligned with ``label_paths``.
        model_names (list): Models to export; the first one determines how many
            files are written.
        output_file (str): Destination CSV path.
        label_paths (list): Label file paths; the file stem becomes the
            ``audio file`` column.
    """
    all_results = []
    num_files = len(metrics_fec[model_names[0]])  # Assumes all metrics have the same number of files

    for file_idx in range(num_files):
        # Take the basename first and only then strip the extension. Splitting
        # on '.' first yields an empty name for any path with a dot in a
        # directory component (e.g. "../labels/a.txt").
        file = label_paths[file_idx].split('/')[-1].split('.')[0]

        for model_name in model_names:
            all_results.append({
                'model': model_name,
                'file index': file_idx,
                'audio file': file,
                'FEC': metrics_fec[model_name][file_idx],
                'MSC': metrics_msc[model_name][file_idx],
                'OVER': metrics_over[model_name][file_idx],
                'NDS': metrics_nds[model_name][file_idx],
            })

    df = pd.DataFrame(all_results)
    df.to_csv(output_file, index=False)

In [5]:
# Locations of the 188-sample subset: audio, reference labels and transcripts.
wav_directory = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/bh_dataset/188_samples/188_audio"
label_directory = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/bh_dataset/188_samples/188_label_for_FE"
transcript_directory = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans"

_, label_paths = read_path(wav_directory, label_directory)
audio_paths, forced_paths = read_path(wav_directory, transcript_directory)

# Sort each list independently so that entries with the same index line up
# (presumably the three directories share file stems; verify against the data).
audio_paths, label_paths, forced_paths = (
    sorted(paths) for paths in (audio_paths, label_paths, forced_paths)
)

# One parsed annotation per label file, in the same order as label_paths.
annotated_segments = list(map(parse_annotations_file_bh, label_paths))

In [8]:
def filter_speech_segments(segments):
    """Return, in order, the entries of ``segments`` that contain a 'speech' key."""
    return list(filter(lambda entry: 'speech' in entry, segments))

def merge_speech_segments(speech_segments):
    """Collapse a list of speech spans into a single span.

    The result runs from the start of the first entry to the end of the last
    entry; anything in between is absorbed. An empty input gives an empty list.
    """
    if speech_segments:
        first_span = speech_segments[0]['speech']
        last_span = speech_segments[-1]['speech']
        return [{'speech': [first_span[0], last_span[1]]}]
    return []

In [9]:
# Per-file prediction lists for each model, plus `rr`, a list of indices to
# drop from the parallel path lists in a later cell.
res_silero, res_speechbrain = [], []
rr = []

In [10]:
# Run Silero and SpeechBrain VAD on every audio file and keep one merged
# speech span per file.
#
# Results are collected in local lists and written into res_silero /
# res_speechbrain only after the whole loop has finished. This keeps the cell
# idempotent: re-running it replaces the previous results instead of appending
# duplicates, and a failure part-way through leaves the earlier state intact.
_silero_results = []
_speechbrain_results = []

for i in range(len(audio_paths)):
    silero = vad_inference_silero(audio_paths[i], model_silero, utils_silero, sampling_rate=SAMPLING_RATE)
    speechbrain = vad_inference_speechbrain(audio_paths[i], vad)

    silero = print_timestamps_silero(silero)
    speechbrain = print_timestamps_speechbrain(speechbrain)

    # Collapse each prediction to a single first-start / last-end span.
    silero = merge_speech_segments(silero)
    speechbrain = merge_speech_segments(speechbrain)

    _silero_results.append(silero)
    _speechbrain_results.append(speechbrain)

# Slice assignment updates the existing list objects in place.
res_silero[:] = _silero_results
res_speechbrain[:] = _speechbrain_results

In [11]:
# Remove the entries at the indices listed in `rr` from the three parallel
# lists. Indices are processed in descending order so that an earlier pop does
# not shift the position of an index that is still to be removed.
# NOTE(review): nothing visible in this notebook appends to `rr`, so this loop
# looks like a no-op here; confirm. Also note that res_silero, res_speechbrain
# and forced_paths are not trimmed, so verify they stay aligned if `rr` is
# ever non-empty.
rr.sort(reverse=True)

for index in rr:
    audio_paths.pop(index)
    label_paths.pop(index)
    annotated_segments.pop(index)

In [12]:
# Write the merged predictions of each model to its own output directory,
# using names derived from the label paths.
file_names = extract_filenames_as_string(label_paths)

for _segments, _out_dir in (
    (res_silero, "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/bh_dataset/predicted_for_FE/silero"),
    (res_speechbrain, "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/bh_dataset/predicted_for_FE/speechbrain"),
):
    save_multiple_speech_segments_as_text(_segments, file_names, _out_dir)

In [13]:
import os

def load_filenames(file_path):
    """
    Load a list of names from a text file, one name per line.

    Args:
        file_path (str): Path to the text file.

    Returns:
        list: The lines of the file, without line terminators.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()

def read_file(file_path):
    """
    Return the full text of a file.

    Args:
        file_path (str): Path to the file.

    Returns:
        str: Everything in the file, as a single string.
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    return text

def find_and_compare_files(directories, filenames):
    """
    For each name, print the content of ``<name>.txt`` from every directory that has it.

    Args:
        directories (list): Directory paths to look in.
        filenames (list): File stems (without the ``.txt`` extension).
    """
    for filename in filenames:
        target = filename + '.txt'
        # directory -> file content, only for directories that hold the file
        found_files = {}
        for directory in directories:
            candidate = os.path.join(directory, target)
            if not os.path.exists(candidate):
                continue
            found_files[directory] = read_file(candidate)

        if not found_files:
            print(f"\nNo matching file found for '{filename}'.")
            continue

        print(f"\nMatching file content for '{filename}':")
        for directory, content in found_files.items():
            print(f"\nDirectory: {directory}")
            print(f"Content of {filename}.txt:\n{content}")

if __name__ == "__main__":
    # File listing the stems to look up, one per line
    input_txt_path = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/evaluation/vad_bhData/name.txt"

    # Two prediction directories and the reference directory to compare
    dir1 = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/silero"
    dir2 = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/speechbrain"
    dir3 = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans_for_FE"
    directories = [dir1, dir2, dir3]

    # Print the matching files side by side for every listed stem
    filenames = load_filenames(input_txt_path)
    find_and_compare_files(directories, filenames)



Matching file content for '281474977585189':

Directory: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/silero
Content of 281474977585189.txt:
1.346000	7.102000	speech


Directory: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/speechbrain
Content of 281474977585189.txt:
1.380000	6.970000	speech


Directory: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans_for_FE
Content of 281474977585189.txt:
S	1.35	1.77


Matching file content for '281474977594045':

Directory: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/silero
Content of 281474977594045.txt:
1.666000	3.966000	speech


Directory: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/speechbrain
Content of 281474977594045.txt:
1.630000	3.900000	speech


Directory: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans_for_FE
Content of 281474

In [20]:
import os

def load_filenames(file_path):
    """
    Load a list of names from a text file, one name per line.

    Args:
        file_path (str): Path to the text file.

    Returns:
        list: The lines of the file, without line terminators.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()

def clean_directories(directories, filenames, dry_run=False):
    """
    Removes ``.txt`` files from the directories whose stem is not listed in ``filenames``.

    WARNING: this deletes files permanently. An empty ``filenames`` list means
    every ``.txt`` file in every directory is removed. Use ``dry_run=True`` to
    preview what would be deleted.

    Args:
        directories (list): List of directory paths.
        filenames (list): List of file stems (without extension) to keep.
        dry_run (bool): When True, only report the files that would be removed.
    """
    # A set makes the membership test constant-time per file.
    keep = set(filenames)

    for directory in directories:
        for file in os.listdir(directory):
            file_name, ext = os.path.splitext(file)
            if ext != '.txt' or file_name in keep:
                continue

            file_path = os.path.join(directory, file)
            if not os.path.isfile(file_path):
                # os.remove cannot delete directories; leave them alone.
                continue

            if dry_run:
                print(f"Would remove: {file_path}")
            else:
                os.remove(file_path)
                print(f"Removed: {file_path}")

if __name__ == "__main__":
    # File listing the stems to keep, one per line
    input_txt_path = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/evaluation/vad_bhData/name.txt"

    # Directories to clean
    dir1 = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/silero"
    dir2 = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/predicted_for_FE/speechbrain"
    dir3 = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans_for_FE"
    directories = [dir1, dir2, dir3]

    # Delete every .txt file whose stem is not in the list (destructive)
    filenames = load_filenames(input_txt_path)
    clean_directories(directories, filenames)


In [21]:
import os

def load_filenames(file_path):
    """
    Load a list of names from a text file, one name per line.

    Args:
        file_path (str): Path to the text file.

    Returns:
        list: The lines of the file, without line terminators.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()

def clean_directory(directory, filenames, extension, dry_run=False):
    """
    Removes files with the given extension whose stem is not listed in ``filenames``.

    WARNING: this deletes files permanently. An empty ``filenames`` list means
    every file with ``extension`` in the directory is removed. Use
    ``dry_run=True`` to preview what would be deleted.

    Args:
        directory (str): Path to the directory to clean.
        filenames (list): List of file stems (without extension) to keep.
        extension (str): File extension to check (e.g., '.wav' or '.txt').
        dry_run (bool): When True, only report the files that would be removed.
    """
    # A set makes the membership test constant-time per file.
    keep = set(filenames)

    for file in os.listdir(directory):
        file_name, ext = os.path.splitext(file)
        if ext != extension or file_name in keep:
            continue

        file_path = os.path.join(directory, file)
        if not os.path.isfile(file_path):
            # os.remove cannot delete directories; leave them alone.
            continue

        if dry_run:
            print(f"Would remove: {file_path}")
        else:
            os.remove(file_path)
            print(f"Removed: {file_path}")

if __name__ == "__main__":
    # File listing the stems to keep (no extensions), one per line
    input_txt_path = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/evaluation/vad_bhData/name.txt"

    # Audio and transcript directories to clean
    wav_dir = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_audio"
    txt_dir = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans"

    filenames = load_filenames(input_txt_path)

    # Delete every file whose stem is not in the list (destructive)
    clean_directory(wav_dir, filenames, '.wav')  # audio files
    clean_directory(txt_dir, filenames, '.txt')  # transcript files


Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474982330859.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474980729400.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474977903187.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474982649957.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474982648679.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474978100266.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474981075344.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474980294565.txt
Removed: /Users/saurabh/Documents/projects/Voice-Activity-Detection/testing/188_trans/281474982587339.txt
Removed: /Users/saurabh/Documents/projects/Voi

In [1]:
import os

def print_txt_files_content(directory_path):
    """Print the content of every ``.txt`` file in ``directory_path``, each followed by a separator."""
    for filename in os.listdir(directory_path):
        # Anything that is not a .txt file is ignored
        if not filename.endswith(".txt"):
            continue

        print(f"Contents of {filename}:\n")
        with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as handle:
            print(handle.read())
            print("\n" + "="*50 + "\n")  # Separator between files

# Print the content of every .txt file in the 188_trans directory of the 188-sample subset.
directory_path = "/Users/saurabh/Documents/projects/Voice-Activity-Detection/data/bh_dataset/188_samples/188_trans"
print_txt_files_content(directory_path)


Contents of 281474982330859.txt:

281474982330859	0.8999999999999999	1.65
281474982330859	1.6500000000000001	1.9500000000000002
281474982330859	1.9500000000000002	2.5500000000000003
281474982330859	2.55	2.9099999999999997
281474982330859	2.91	3.24
281474982330859	3.33	3.99
281474982330859	3.99	4.23
281474982330859	4.2299999999999995	4.8
281474982330859	4.800000000000001	5.4
281474982330859	5.4	5.94



Contents of 281474982801729.txt:

281474982801729	0.6000000000000001	1.1700000000000002



Contents of 281474980729400.txt:

281474980729400	0.72	1.32
281474980729400	1.32	1.4100000000000001
281474980729400	1.41	1.71
281474980729400	1.71	1.89
281474980729400	1.8900000000000001	2.2800000000000002
281474980729400	2.2800000000000002	2.49
281474980729400	2.4899999999999998	2.82
281474980729400	2.82	2.9699999999999998
281474980729400	2.9699999999999998	3.4499999999999997
281474980729400	3.4499999999999997	3.9899999999999998
281474980729400	4.0200000000000005	4.23
281474980729400	4.229999999999