In [1]:
from pathlib import Path
import json
import librosa
import math
from typing import List
from tqdm import tqdm
from multiprocessing import Pool
# Upper bound, in seconds, on the length of one manifest segment; it is passed
# as `max_duration` to build_manifest / build_manifest_mp in later cells.
max_duration = 20.0

def load_manifest(filepath):
    """Read a JSON-lines manifest and return its records as a list.

    Every line of ``filepath`` is stripped of surrounding whitespace; lines
    that end up empty are ignored and the rest are parsed with ``json.loads``.
    """
    with Path(filepath).open("r") as manifest_file:
        stripped = (raw.strip() for raw in manifest_file)
        return [json.loads(text) for text in stripped if text]

def get_durations(data):
    """Return the ``"duration"`` value of each manifest entry, in input order."""
    return [entry["duration"] for entry in data]

def save_manifest(data, filepath):
    """Write the entries of ``data`` to ``filepath``, one JSON object per line.

    The target file is opened in write mode, so existing content is replaced.
    """
    with Path(filepath).open("w") as sink:
        sink.writelines(json.dumps(record) + "\n" for record in data)

def get_labels(data):
    """Return, for each manifest entry, its ``'label'`` string split on whitespace."""
    return [entry['label'].split() for entry in data]

In [2]:
# Root directories of the two audio collections; a later cell searches them
# recursively for .wav files.
# NOTE(review): these are absolute, machine-specific paths — adjust them before
# running the notebook on another machine.
musan_path = Path("/media/data/datasets/vad_sd/musan")
freesound_path = Path("/media/data/datasets/vad_sd/freesound")

In [3]:
# Collect every .wav file found at any depth below each dataset root, then
# report how many files each collection contains (one count per line).
musan_files = [*musan_path.glob("**/*.wav")]
freesound_files = [*freesound_path.glob("**/*.wav")]
print(len(musan_files), len(freesound_files), sep="\n")

1590
7741


In [4]:
def build_manifest(audio_files: List[Path], max_duration: float, sample_rate: int = 16000, frame_length: float = 0.01) -> List[dict]:
    """Build a segmented manifest with all-zero frame labels for each audio file.

    Every existing file is measured at ``sample_rate`` and cut into
    consecutive segments of at most ``max_duration`` seconds. Each segment
    becomes one entry whose label holds one ``"0"`` token per
    ``frame_length`` seconds of the segment (rounded up).

    Args:
        audio_files: Paths of the audio files to index. Missing files are
            reported on stdout and skipped.
        max_duration: Maximum segment length in seconds.
        sample_rate: Sampling rate the audio is decoded at before its
            duration is measured.
        frame_length: Duration in seconds covered by one label token.

    Returns:
        A list of dicts with the keys ``audio_filepath``, ``text``,
        ``offset``, ``duration`` and ``label``, ordered by file and then by
        offset.
    """
    data = []
    for audio_file in tqdm(audio_files, total=len(audio_files)):
        if not audio_file.exists():
            print(f"File not found: {audio_file}")
            continue
        # Decode directly at the target rate. Without `sr`, librosa.load
        # first resamples to its 22050 Hz default, which forced a second
        # resampling pass just to measure the duration.
        y, _ = librosa.load(audio_file, sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sample_rate)
        num_segments = math.ceil(total_duration / max_duration)
        for index in range(num_segments):
            offset = index * max_duration
            # Remaining audio, capped at max_duration. The previous
            # `total_duration % max_duration` yielded a zero-length last
            # segment whenever the duration was an exact multiple.
            dur = min(max_duration, total_duration - offset)
            label = " ".join(["0"] * math.ceil(dur / frame_length))
            data.append({
                "audio_filepath": str(audio_file.absolute()),
                "text": "_",
                "offset": offset,
                "duration": dur,
                "label": label,
            })
    return data

def build_noise_aug_manifest(audio_files: List[Path]) -> List[dict]:
    """Build a manifest with one whole-file ``"background"`` entry per audio file.

    Args:
        audio_files: Paths of the audio files to index. Missing files are
            reported on stdout and skipped.

    Returns:
        A list of dicts with the keys ``audio_filepath``, ``text``,
        ``offset`` (always ``0.0``), ``duration`` (seconds) and ``label``
        (always ``"background"``), in input order.
    """
    data = []
    for audio_file in tqdm(audio_files, total=len(audio_files)):
        if not audio_file.exists():
            print(f"File not found: {audio_file}")
            continue
        # sr=None keeps the file's native sampling rate. Only the duration
        # is needed, so resampling to librosa's 22050 Hz default was wasted
        # work.
        y, native_sr = librosa.load(audio_file, sr=None)
        duration = librosa.get_duration(y=y, sr=native_sr)
        data.append({
            "audio_filepath": str(audio_file.absolute()),
            "text": "_",
            "offset": 0.0,
            "duration": duration,
            "label": "background",
        })
    return data

        

In [18]:
# Build the segmented, zero-labelled manifest for the MUSAN files in a single
# process (the captured output below shows this took about 15 minutes).
musan_manifest = build_manifest(musan_files, max_duration=max_duration)

100%|██████████| 1590/1590 [15:33<00:00,  1.70it/s]


In [19]:
# Write the MUSAN segment manifest as JSON lines; the relative filename is
# resolved against the notebook's current working directory.
save_manifest(musan_manifest, "musan_all.json")

In [30]:
def process_fn(x):
    """Build the segmented, zero-labelled manifest entries for one audio file.

    This is the per-file worker that ``build_manifest_mp`` maps over a
    process pool, which is why all arguments arrive packed in one tuple.

    Args:
        x: Tuple ``(audio_file, max_duration, sample_rate, frame_length)``:
            the file's ``Path``, the maximum segment length in seconds, the
            sampling rate the audio is decoded at, and the duration in
            seconds covered by one label token.

    Returns:
        A list of dicts with the keys ``audio_filepath``, ``text``,
        ``offset``, ``duration`` and ``label``, one per segment in offset
        order. The list is empty when the file does not exist (a message is
        printed in that case).
    """
    audio_file, max_duration, sample_rate, frame_length = x
    if not audio_file.exists():
        print(f"File not found: {audio_file}")
        return []
    # Decode directly at the target rate. Without `sr`, librosa.load first
    # resamples to its 22050 Hz default, which forced a second resampling
    # pass just to measure the duration.
    y, _ = librosa.load(audio_file, sr=sample_rate)
    total_duration = librosa.get_duration(y=y, sr=sample_rate)
    num_segments = math.ceil(total_duration / max_duration)
    res = []
    for index in range(num_segments):
        offset = index * max_duration
        # Remaining audio, capped at max_duration. The previous
        # `total_duration % max_duration` yielded a zero-length last segment
        # whenever the duration was an exact multiple of max_duration.
        dur = min(max_duration, total_duration - offset)
        label = " ".join(["0"] * math.ceil(dur / frame_length))
        res.append({
            "audio_filepath": str(audio_file.absolute()),
            "text": "_",
            "offset": offset,
            "duration": dur,
            "label": label,
        })
    return res

def build_manifest_mp(audio_files: List[Path], max_duration: float, sample_rate: int = 16000, frame_length: float = 0.01, num_workers: int = 20) -> List[dict]:
    """Build the segmented manifest for many files using a process pool.

    Each file is handed to ``process_fn`` in a worker process and the
    per-file entry lists are concatenated in input order.

    Args:
        audio_files: Paths of the audio files to index.
        max_duration: Maximum segment length in seconds.
        sample_rate: Sampling rate forwarded to ``process_fn``.
        frame_length: Duration in seconds covered by one label token.
        num_workers: Number of worker processes. Defaults to 20, the value
            that used to be hard-coded.

    Returns:
        The flattened list of manifest entries for all files.
    """
    # The pool's map functions take a single argument, so bundle the
    # per-file parameters into one tuple per task.
    queue = [(afile, max_duration, sample_rate, frame_length) for afile in audio_files]
    if not queue:
        # Nothing to do; avoid spawning worker processes for no work.
        return []
    with Pool(processes=num_workers) as pool:
        # imap yields results in submission order, so the output order
        # follows `audio_files` while tqdm tracks progress.
        results = list(tqdm(pool.imap(process_fn, queue), total=len(queue)))
    return [entry for res in results for entry in res]


In [31]:
# Build the segmented, zero-labelled manifest for the Freesound files with the
# multiprocessing variant (the captured output below shows about 70 minutes).
freesound_manifest = build_manifest_mp(freesound_files, max_duration=max_duration)

100%|██████████| 7741/7741 [1:10:19<00:00,  1.83it/s]


In [33]:
# Write the Freesound segment manifest as JSON lines; the relative filename is
# resolved against the notebook's current working directory.
save_manifest(freesound_manifest, "freesound_all.json")

In [5]:
# Build the whole-file "background" manifest for the MUSAN files (one entry
# per file, no segmentation) and write it out as JSON lines.
musan_manifest2 = build_noise_aug_manifest(musan_files)
save_manifest(musan_manifest2, "noise_musan_all.json")

100%|██████████| 1590/1590 [06:29<00:00,  4.09it/s]


In [7]:
def process_fn2(x):
    """Build the whole-file ``"background"`` manifest entry for one audio file.

    This is the per-file worker that ``build_manifest_mp2`` maps over a
    process pool.

    Args:
        x: ``Path`` of the audio file.

    Returns:
        A dict with the keys ``audio_filepath``, ``text``, ``offset``
        (always ``0.0``), ``duration`` (seconds) and ``label`` (always
        ``"background"``). An empty dict is returned when the file does not
        exist (a message is printed in that case).
    """
    audio_file = x
    if not audio_file.exists():
        print(f"File not found: {audio_file}")
        return {}
    # sr=None keeps the file's native sampling rate. Only the duration is
    # needed, so resampling to librosa's 22050 Hz default was wasted work.
    y, native_sr = librosa.load(audio_file, sr=None)
    duration = librosa.get_duration(y=y, sr=native_sr)
    return {
        "audio_filepath": str(audio_file.absolute()),
        "text": "_",
        "offset": 0.0,
        "duration": duration,
        "label": "background",
    }

def build_manifest_mp2(audio_files: List[Path], num_workers: int = 20) -> List[dict]:
    """Build the whole-file ``"background"`` manifest using a process pool.

    Each file is handed to ``process_fn2`` in a worker process. Files that
    ``process_fn2`` could not find come back as empty dicts; those are
    dropped so that the manifest does not end up containing empty records.

    Args:
        audio_files: Paths of the audio files to index.
        num_workers: Number of worker processes. Defaults to 20, the value
            that used to be hard-coded.

    Returns:
        The list of manifest entries, in input order, without placeholders
        for missing files.
    """
    if not audio_files:
        # Nothing to do; avoid spawning worker processes for no work.
        return []
    with Pool(processes=num_workers) as pool:
        # imap yields results in submission order while tqdm tracks progress.
        results = list(tqdm(pool.imap(process_fn2, audio_files), total=len(audio_files)))
    # process_fn2 signals a missing file with {}, which is falsy.
    return [entry for entry in results if entry]

# Build the whole-file "background" manifest for the Freesound files with the
# multiprocessing variant and write it out as JSON lines.
freesound_manifest2 = build_manifest_mp2(freesound_files)
save_manifest(freesound_manifest2, "noise_freesound_all.json")

100%|██████████| 7741/7741 [27:15<00:00,  4.73it/s]  
