In [66]:
import os
import torch
import torch.nn.functional as F
import torchaudio
import numpy as np
from model.stft import AudioPreprocessor

In [67]:
DATASET = "./calibration"
OUTPUT = "./calibration_preprocess"

SAMPLE_RATE = 16000
DURATION = 1
NUM_SAMPLES = SAMPLE_RATE * DURATION
WINDOW_SIZE = 512
HOP_SIZE = 160
MEL_BINS = 64
FMIN = 50
FMAX = 8000

In [68]:
os.makedirs(OUTPUT, exist_ok=True)

audio_preprocess = AudioPreprocessor(
    sample_rate=SAMPLE_RATE,
    window_size=WINDOW_SIZE,
    hop_size=HOP_SIZE,
    mel_bins=MEL_BINS,
    fmin=FMIN,
    fmax=FMAX
)

In [69]:
def convert_logmel(file_path):
    waveform, sr = torchaudio.load(file_path)
    
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
        waveform = resampler(waveform)

    if waveform.shape[1] < NUM_SAMPLES:
        waveform = F.pad(waveform, (0, NUM_SAMPLES - waveform.shape[1]))
    else:
        waveform = waveform[:, :NUM_SAMPLES]

    waveform = waveform / (waveform.abs().max() + 1e-9)

    with torch.no_grad():
        logmel = audio_preprocess(waveform)

        if logmel.shape[2] < 101:
            logmel = F.pad(logmel, (0, 101 - logmel.shape[2]))

        elif logmel.shape[2] > 101:
            logmel = logmel[:, :, :101]

    return logmel.numpy()

In [70]:
for file in os.listdir(DATASET):
    path = os.path.join(DATASET, file)
    data = convert_logmel(path)
    print(data.shape)
    np.save(os.path.join(OUTPUT, file.replace(".wav", ".npy")), data)
    # break

(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 101, 64)
(1, 1, 1