In [40]:
import os
import torch
import torch.nn.functional as F
import torchaudio
import numpy as np
from model.stft import AudioPreprocessor
from tqdm import tqdm

In [31]:
DATASET = ".\\Dataset"
OUTPUT = ".\\calibration"

SAMPLE_RATE = 16000
DURATION = 1
NUM_SAMPLES = SAMPLE_RATE * DURATION
WINDOW_SIZE = 512
HOP_SIZE = 160
MEL_BINS = 64
FMIN = 50
FMAX = 8000

In [32]:
os.makedirs(OUTPUT, exist_ok=True)

audio_preprocess = AudioPreprocessor(
    sample_rate=SAMPLE_RATE,
    window_size=WINDOW_SIZE,
    hop_size=HOP_SIZE,
    mel_bins=MEL_BINS,
    fmin=FMIN,
    fmax=FMAX
)

In [33]:
def convert_logmel(file_path):
    waveform, sr = torchaudio.load(file_path)
    
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
        waveform = resampler(waveform)

    if waveform.shape[1] < NUM_SAMPLES:
        waveform = F.pad(waveform, (0, NUM_SAMPLES - waveform.shape[1]))
    else:
        waveform = waveform[:, :NUM_SAMPLES]

    waveform = waveform / (waveform.abs().max() + 1e-9)

    with torch.no_grad():
        logmel = audio_preprocess(waveform)

        if logmel.shape[2] < 101:
            logmel = F.pad(logmel, (0, 101 - logmel.shape[2]))

        elif logmel.shape[2] > 101:
            logmel = logmel[:, :, :101]

    return logmel.numpy()

In [34]:
filepaths = []

for root, _, files in os.walk(DATASET):
    
    for file in files:
        if file.lower().endswith(".wav"):
            filepaths.append(os.path.join(root, file))

print(filepaths)

['.\\Dataset\\danger\\1_danger_alarm_0.wav', '.\\Dataset\\danger\\1_danger_alarm_1.wav', '.\\Dataset\\danger\\1_danger_alarm_10.wav', '.\\Dataset\\danger\\1_danger_alarm_100.wav', '.\\Dataset\\danger\\1_danger_alarm_101.wav', '.\\Dataset\\danger\\1_danger_alarm_102.wav', '.\\Dataset\\danger\\1_danger_alarm_103.wav', '.\\Dataset\\danger\\1_danger_alarm_104.wav', '.\\Dataset\\danger\\1_danger_alarm_105.wav', '.\\Dataset\\danger\\1_danger_alarm_106.wav', '.\\Dataset\\danger\\1_danger_alarm_107.wav', '.\\Dataset\\danger\\1_danger_alarm_108.wav', '.\\Dataset\\danger\\1_danger_alarm_109.wav', '.\\Dataset\\danger\\1_danger_alarm_11.wav', '.\\Dataset\\danger\\1_danger_alarm_110.wav', '.\\Dataset\\danger\\1_danger_alarm_111.wav', '.\\Dataset\\danger\\1_danger_alarm_112.wav', '.\\Dataset\\danger\\1_danger_alarm_113.wav', '.\\Dataset\\danger\\1_danger_alarm_114.wav', '.\\Dataset\\danger\\1_danger_alarm_115.wav', '.\\Dataset\\danger\\1_danger_alarm_116.wav', '.\\Dataset\\danger\\1_danger_alarm_117

In [42]:
for path in tqdm(filepaths, desc="file convert", leave=True):
    file = path.split("\\")[-1]
    # print(file)
    data = convert_logmel(path)
    # print(data.shape)
    output = os.path.join(OUTPUT, file.replace(".wav", ".npy"))
    # print(output)
    np.save(output, data)
    # break

file convert: 100%|██████████| 3000/3000 [00:05<00:00, 574.67it/s]
