In [45]:
from nemo.collections.asr.parts.utils import vad_utils

from math import ceil
from typing import List, Tuple, Dict
from pathlib import Path
import json
import pandas as pd
from tqdm import tqdm

# Global constants
# All four values are forwarded, under keys of the same name, in the config dict that
# get_temp_manifest() hands to vad_utils.prepare_manifest().
window_length_in_sec = 0.01  # NOTE(review): presumably the VAD window length in seconds - confirm against NeMo
split_duration = 20.0  # NOTE(review): presumably the max length (seconds) of each audio chunk - confirm against NeMo
num_workers = 1  # NOTE(review): presumably the number of worker processes used by prepare_manifest - confirm
prepared_manifest_vad_input = "temp_manifest.json"  # NOTE(review): presumably where prepare_manifest writes its manifest - confirm


def load_file(filepath):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filepath: Location of the text file (str or Path).

    Returns:
        A list holding one stripped string per line, in file order. Blank lines
        are kept and show up as empty strings.
    """
    with Path(filepath).open("r") as handle:
        return [raw_line.strip() for raw_line in handle]

def change_prefix_dir(data: list, prefix: Path, key: str = "vad_sd"):
    """Re-root every path in ``data`` under ``prefix``.

    For each string the first occurrence of ``key`` is located; everything after
    ``key`` and the single character that follows it (the path separator in the
    paths used by this notebook) is appended to ``prefix``.

    Args:
        data: Path strings that each contain ``key``.
        prefix: New root directory the remainder of each path is joined onto.
        key: Marker substring that ends the part of the path to discard.

    Returns:
        A list of the re-rooted paths as strings, in input order.

    Raises:
        ValueError: If ``key`` does not occur in an item (raised by ``str.index``).
    """
    # +1 also drops the character right after the key (the "/" separator).
    skip = len(key) + 1
    return [str(prefix / Path(entry[entry.index(key) + skip:])) for entry in data]

def load_manifest(filepath):
    """Load a JSON-lines manifest: one JSON document per line.

    Blank lines (for example a trailing empty line at the end of the file) are
    skipped instead of being handed to ``json.loads``, which would raise.

    Args:
        filepath: Location of the manifest file (str or Path).

    Returns:
        A list with the decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []
    with Path(filepath).open("r") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Tolerate empty / whitespace-only lines.
                continue
            results.append(json.loads(line))
    return results

def save_manifest(filepath, manifest):
    """Write ``manifest`` as JSON lines, overwriting ``filepath``.

    Args:
        filepath: Destination file (str or Path); opened in write mode.
        manifest: Iterable of JSON-serialisable items; each one becomes a single
            line terminated by a newline.
    """
    with Path(filepath).open('w') as sink:
        sink.writelines(json.dumps(entry) + "\n" for entry in manifest)

def load_rttm_file(filepath):
    """Parse an RTTM file into a DataFrame of time segments sorted by start time.

    Fields are split on any run of whitespace, so records that use tabs or
    several consecutive spaces between fields keep the same column positions as
    records separated by single spaces.

    Args:
        filepath: Location of the RTTM file. The file has no header row; the
            fields at positions 3, 4 and 7 of each record are used as start
            time, duration and speaker respectively.

    Returns:
        A pandas DataFrame sorted by ``start`` in which columns 3, 4 and 7 are
        renamed to ``start``, ``dur`` and ``speaker``, with two added columns:
        ``end`` (``start + dur``) and ``segment`` (a ``(start, end)`` tuple).
    """
    # Whitespace-run separator instead of a single literal space.
    data = pd.read_csv(filepath, sep=r"\s+", header=None)
    data = data.rename(columns={3: "start", 4: "dur", 7: "speaker"})

    data['start'] = data['start'].astype(float)
    data['dur'] = data['dur'].astype(float)
    data['end'] = data['start'] + data['dur']

    data = data.sort_values(by=['start'])
    data['segment'] = list(zip(data['start'], data['end']))

    return data

def build_map(file_list):
    """Index file paths by their stem (file name without directory and suffix).

    Args:
        file_list: Iterable of file paths.

    Returns:
        A dict mapping each stem to the path it was taken from.

    Raises:
        ValueError: If two paths share the same stem.
    """
    stem_to_path = {}
    for path in file_list:
        stem = Path(path).stem
        if stem not in stem_to_path:
            stem_to_path[stem] = path
            continue
        raise ValueError(f"filenames must be unique, but got {path} as duplicate.")
    return stem_to_path


def get_temp_manifest(audio_list, rttm_list):
    """Build the manifest produced by NeMo's ``prepare_manifest`` and attach RTTM paths.

    Audio files are paired with RTTM files by file stem. Audio files without a
    matching RTTM are reported on stdout and left out. The remaining audio paths
    are passed to ``vad_utils.prepare_manifest`` together with the module-level
    settings; the manifest file it returns is loaded and every entry gets a
    ``rttm_file`` key pointing at the RTTM with the same stem.

    Args:
        audio_list: Audio file paths with unique stems.
        rttm_list: RTTM file paths with unique stems.

    Returns:
        The list of manifest entries (dicts) read back from the prepared
        manifest, each extended with ``rttm_file``.

    Raises:
        ValueError: If either list contains duplicate stems (from ``build_map``).
    """
    audio_file_map = build_map(audio_list)
    rttm_file_map = build_map(rttm_list)

    temp_manifest = []
    for key, val in audio_file_map.items():
        if key in rttm_file_map:
            temp_manifest.append({"audio_filepath": str(val)})
        else:
            print(f"Key {key} not in rttm list, skipping...")

    config = {
        'input': temp_manifest,
        'window_length_in_sec': window_length_in_sec,
        'split_duration': split_duration,
        'num_workers': num_workers,
        "prepared_manifest_vad_input": prepared_manifest_vad_input,
    }

    # NOTE(review): prepare_manifest presumably splits long audio into chunks and
    # returns the path of the manifest it wrote - confirm against NeMo.
    manifest_vad_input = vad_utils.prepare_manifest(config)

    results = []
    for entry in load_manifest(manifest_vad_input):
        stem = Path(entry["audio_filepath"]).stem
        entry["rttm_file"] = rttm_file_map[stem]
        results.append(entry)
    return results


def merge_intervals(intervals: List[List[float]]) -> List[List[float]]:
    """Merge overlapping or touching intervals into a sorted, disjoint list.

    The input is not modified: neither the outer list nor the individual
    intervals are changed, and intervals may be given as lists or tuples.

    Args:
        intervals: Sequence of ``[start, end]`` pairs in any order.

    Returns:
        A new list of ``[start, end]`` lists sorted by start, in which any two
        intervals where one begins at or before the other's end have been fused.
    """
    merged: List[List[float]] = []
    for interval in sorted(intervals, key=lambda x: x[0]):
        if not merged or merged[-1][1] < interval[0]:
            # No overlap with the previous interval: start a new one. Copy it so
            # later in-place extension never touches the caller's data.
            merged.append(list(interval))
        else:
            # Overlap (or touching ends): extend the previous interval.
            merged[-1][1] = max(merged[-1][1], interval[1])
    return merged

def get_speech_segments(rttm_file):
    """Return the merged speech intervals described by one RTTM file.

    Args:
        rttm_file: Location of the RTTM file.

    Returns:
        The result of ``merge_intervals`` applied to the file's ``segment``
        values, each converted to a ``[start, end]`` list first.
    """
    segment_column = load_rttm_file(rttm_file)['segment']
    return merge_intervals([list(pair) for pair in segment_column])


def get_speech_segments_map(rttm_list):
    """Map each RTTM file's stem to its merged speech segments.

    Args:
        rttm_list: Iterable of RTTM file paths.

    Returns:
        A dict from file stem to the value of ``get_speech_segments`` for that
        file. If two paths share a stem, the later one wins.
    """
    return {Path(rttm_path).stem: get_speech_segments(rttm_path) for rttm_path in rttm_list}


def get_frame_labels(segments: List[List[float]], frame_length: float, offset: float, duration: float, sid: int = 0):
    """Label fixed-length frames as speech ('1') or non-speech ('0').

    Frame ``i`` is sampled at time ``offset + i * frame_length`` and is labelled
    '1' when that time lies inside the current segment (both ends inclusive).
    ``sid`` acts as a cursor into ``segments`` that only moves forward, so the
    function expects ``segments`` to be sorted by start time and non-overlapping.

    Args:
        segments: ``[start, end]`` speech intervals. May be empty, in which case
            every frame is labelled '0'.
        frame_length: Spacing between consecutive frames.
        offset: Time of the first frame.
        duration: Length of the span to label; ``ceil(duration / frame_length)``
            frames are produced.
        sid: Index of the segment to start scanning from.

    Returns:
        A tuple ``(labels, sid)``: the space-separated label string and the index
        of the segment the cursor ended on.
    """
    n_frames = ceil(duration / frame_length)

    if not segments:
        # No speech at all: nothing to index into, every frame is non-speech.
        return ' '.join(['0'] * n_frames), sid

    labels = []
    last = len(segments) - 1
    for i in range(n_frames):
        t = offset + i * frame_length
        # Skip segments that ended before this frame; never move past the last one.
        while sid < last and segments[sid][1] < t:
            sid += 1
        labels.append('1' if segments[sid][0] <= t <= segments[sid][1] else '0')
    return ' '.join(labels), sid

def get_frame_manifest(manifest: List[Dict], segments_map: Dict, frame_length: float):
    """Fill in the frame-level ``label`` of every manifest entry.

    Each entry's ``rttm_file`` stem selects its speech segments from
    ``segments_map``; the label string is computed by ``get_frame_labels`` from
    the entry's ``offset`` and ``duration``. Entries are updated in place.

    Args:
        manifest: Entries with ``rttm_file``, ``offset`` and ``duration`` keys.
        segments_map: Mapping from RTTM file stem to its speech segments.
        frame_length: Frame spacing passed on to ``get_frame_labels``.

    Returns:
        A list containing the same (mutated) entry objects, in input order.
    """
    results = []
    for entry in tqdm(manifest):
        segments = segments_map[Path(entry['rttm_file']).stem]
        frame_labels, _ = get_frame_labels(segments, frame_length, entry['offset'], entry['duration'])
        entry['label'] = frame_labels
        results.append(entry)
    return results


In [28]:
# Locate the audio / RTTM list files of one dataset split.
data_root = Path("/media/data/datasets/vad_sd/")

data_name = "ch120"
data_dir = data_root / data_name
annotation_files_dir = data_dir / "list"

split = "CH109"
audio_list_file = annotation_files_dir / f"audio_{split}list.txt"
rttm_list_file = annotation_files_dir / f"rttm_{split}list.txt"

# Paths exactly as they are stored in the list files.
audio_list = load_file(audio_list_file)
rttm_list = load_file(rttm_list_file)
print(audio_list[:1])
print(rttm_list[:1])

# Re-root the listed paths under the local data_root and show the result.
audio_list = change_prefix_dir(audio_list, data_root)
rttm_list = change_prefix_dir(rttm_list, data_root)
print(audio_list[:1])
print(rttm_list[:1])

['/mnt/data/vad_sd/ch120/audio/en_4629.wav']
['/mnt/data/vad_sd/ch120/rttm/en_4629.rttm']
['/media/data/datasets/vad_sd/ch120/audio/en_4629.wav']
['/media/data/datasets/vad_sd/ch120/rttm/en_4629.rttm']


In [29]:
# Spacing between labelled frames, in seconds (0.04 s = 40 ms).
frame_length = 0.04
temp_manifest = get_temp_manifest(audio_list, rttm_list)
print(temp_manifest[0])
all_segments_map = get_speech_segments_map(rttm_list)
print(len(all_segments_map))
new_manifest = get_frame_manifest(temp_manifest, all_segments_map, frame_length)
# Derive the "<N>ms" file-name suffix from frame_length so the name can never
# disagree with the labels that were actually generated.
frame_length_ms = round(frame_length * 1000)
save_manifest(f"{data_name}_{split}_{frame_length_ms}ms.json", new_manifest)

100%|█████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.69it/s]

[NeMo I 2022-10-04 14:12:59 vad_utils:89] The prepared manifest file exists. Overwriting!





{'audio_filepath': '/media/data/datasets/vad_sd/ch120/audio/en_4629.wav', 'duration': 20.0, 'label': 'infer', 'text': '_', 'offset': 0, 'rttm_file': '/media/data/datasets/vad_sd/ch120/rttm/en_4629.rttm'}
11


100%|█████████████████████████████████████████████████████████████████████| 329/329 [00:00<00:00, 7328.14it/s]


24


In [35]:
def change_data_prefix(filepath, src, dst):
    """Load a manifest and move its file paths from the ``src`` root to ``dst``.

    For the ``audio_filepath`` and ``rttm_file`` keys of every entry, a value
    located under the directory ``src`` is rewritten to the same relative
    location under ``dst``. The match is done on whole path components, so
    ``/data/vad_sd`` does not match ``/data/vad_sd2/...``, and a trailing
    separator on ``src`` makes no difference. Values outside ``src`` and
    entries lacking a key are left untouched.

    Args:
        filepath: Manifest file to read (JSON lines, via ``load_manifest``).
        src: Directory prefix to replace.
        dst: Directory prefix to use instead.

    Returns:
        The list of manifest entries with the rewritten paths (as strings).
    """
    src_root = Path(src)
    dst_root = Path(dst)

    results = []
    for item in load_manifest(filepath):
        for key in ("audio_filepath", "rttm_file"):
            if key not in item:
                continue
            try:
                relative = Path(item[key]).relative_to(src_root)
            except ValueError:
                # Not located under src: keep the original value.
                continue
            item[key] = str(dst_root / relative)
        results.append(item)
    return results

In [37]:
# Write a "<name>_drc.json" copy of every manifest with its data paths moved
# from /media/data/datasets/vad_sd to /data. Paths containing 'drc' are skipped,
# so previously generated copies are not processed again.
for manifest in Path('./manifests_long/').glob('*.json'):
    if 'drc' not in str(manifest):
        parent = manifest.parent
        output = parent / f"{manifest.stem}_drc.json"
        data = change_data_prefix(manifest, '/media/data/datasets/vad_sd', '/data')
        save_manifest(output, data)
        print(f"saved {output}")


saved manifests_long/ami_eval_20ms_drc.json
saved manifests_long/ami_train_40ms_drc.json
saved manifests_long/ch120_CH109_40ms_drc.json
saved manifests_long/ami_dev_40ms_drc.json
saved manifests_long/icsi_all_40ms_drc.json
saved manifests_long/ch120_moved_40ms_drc.json
saved manifests_long/fisher_2005_40ms_drc.json
saved manifests_long/ami_eval_40ms_drc.json
saved manifests_long/fisher_2004_40ms_drc.json


# Process AVA Dataset

In [46]:
from collections import defaultdict
label_dir = Path("/media/data/datasets/ava/nnlabel")
label_files = list(label_dir.glob("*.label"))

# Maps each label file's stem to its list of binary labels, one per non-blank
# line: '0' for a "NO_SPEECH" line and '1' for any other line.
all_labels_map = defaultdict(list)
for filepath in label_files:
    key = filepath.stem
    labels = []
    with Path(filepath).open("r") as fin:
        for line in fin:
            label = line.strip()
            if not label:
                continue
            labels.append('0' if label == "NO_SPEECH" else '1')
    # Keep the result: without this the labels were computed and then discarded,
    # leaving all_labels_map empty.
    all_labels_map[key] = labels

[PosixPath('/media/data/datasets/ava/nnlabel/2PpxiG0WU18.label'), PosixPath('/media/data/datasets/ava/nnlabel/J4bt4y9ShTA.label'), PosixPath('/media/data/datasets/ava/nnlabel/AN07xQokfiE.label'), PosixPath('/media/data/datasets/ava/nnlabel/Ov0za6Xb1LM.label'), PosixPath('/media/data/datasets/ava/nnlabel/K_SpqDJnlps.label')]


24
