In [None]:
# Parameters
# All values are None placeholders here; presumably they are overridden by an
# external parameter-injection step (e.g. papermill) before the notebook runs
# -- TODO confirm. The numeric ones are cast with int() in a later cell, which
# fails if they are still None.

# Root folder holding one subfolder per sound type.
input_folder = None
# Destination passed to np.save for the feature array (Z).
feature_file = None
# Destination passed to np.save for the target array (y).
target_file = None

# Chunk length and step between chunk starts, both in seconds.
chunk_length = None
chunk_step = None
# Sampling rate every recording is resampled to before feature extraction.
target_sr = None
# Intended spectrogram settings: FFT window size, hop between frames (in
# samples) and number of mel bands.
n_fft = None
hop_length = None
n_mels = None

In [1]:
import os
import IPython.display as ipd
import time
import librosa
import numpy as np

In [None]:
# Coerce every numeric parameter to int in one pass (same order as declared).
(chunk_length, chunk_step, target_sr,
 n_fft, hop_length, n_mels) = map(
    int,
    (chunk_length, chunk_step, target_sr, n_fft, hop_length, n_mels),
)

In [None]:
# Sound classes to process; each name is also the name of a subfolder of input_folder.
sound_types = [
    'bird',
    'dog',
    'rain',
]

In [2]:
def stft_and_chunk(audio,
                   chunk_length=5,
                   chunk_step=1,
                   sr=16000,
                   n_fft=2048,
                   hop_length=512,
                   ):
    """
    Compute the magnitude STFT of an audio signal and split it into fixed-length chunks.

    Parameters:
        audio (np.ndarray): Audio time series.
        chunk_length (int): Length of each chunk in seconds.
        chunk_step (int): Step between the starts of consecutive chunks in seconds.
        sr (int): Sampling rate of ``audio``; used to convert seconds into frames.
        n_fft (int): FFT window size.
        hop_length (int): Number of samples between successive frames.

    Returns:
        List of magnitude-STFT chunks (np.ndarray). Every chunk has
        ``int(chunk_length * sr / hop_length)`` columns; chunks that run past
        the end of the signal are zero-padded on the right.
    """

    # Convert chunk length / step from seconds to STFT frames
    frames_per_chunk = int(chunk_length * sr / hop_length)
    steps_per_chunk = int(chunk_step * sr / hop_length)

    # Magnitude spectrogram of the whole signal (frequency bins x frames)
    stft = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length))

    stft_chunks = []
    for start in range(0, stft.shape[1], steps_per_chunk):

        # Extract the window starting at this frame
        chunk = stft[:, start:start + frames_per_chunk]

        # Zero-pad along the time axis when the window runs past the end
        missing = frames_per_chunk - chunk.shape[1]
        if missing > 0:
            chunk = np.pad(chunk, ((0, 0), (0, missing)))

        # Store the chunk itself (not the full spectrogram)
        stft_chunks.append(chunk)

    return stft_chunks

def melspectrogram_and_chunk(audio,
                             chunk_length=5,
                             chunk_step=1,
                             sr=16000,
                             n_fft=2048,
                             hop_length=512,
                             n_mels=128):
    """
    Turn an audio signal into a list of equally sized mel-spectrogram windows.

    Parameters:
        audio (np.ndarray): Audio time series.
        chunk_length (int): Window length in seconds.
        chunk_step (int): Distance between window starts in seconds.
        sr (int): Sampling rate.
        n_fft (int): FFT window size.
        hop_length (int): Number of samples between successive frames.
        n_mels (int): Number of Mel bands.

    Returns:
        List of melspectrogram chunks (np.ndarray), each
        ``int(chunk_length * sr / hop_length)`` frames wide; windows that
        extend past the end of the spectrogram are zero-padded on the right.
    """
    # Mel spectrogram of the complete signal
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )

    # Window width and stride expressed in frames rather than seconds
    window = int(chunk_length * sr / hop_length)
    stride = int(chunk_step * sr / hop_length)

    def _fit_to_window(segment):
        # Right-pad a short segment with zeros up to the window width
        shortfall = window - segment.shape[1]
        if shortfall > 0:
            return np.pad(segment, ((0, 0), (0, shortfall)))
        return segment

    total_frames = spectrogram.shape[1]
    return [
        _fit_to_window(spectrogram[:, begin:begin + window])
        for begin in range(0, total_frames, stride)
    ]

In [5]:
# Set up the folder: pick one example recording of the first sound type
# for manual inspection / playback.
idx = 10  # position of the example within the sorted .wav list; IndexError if fewer files exist
folder = os.path.join(input_folder, sound_types[0])
# os.listdir returns entries in arbitrary order; sort so idx selects the same file on every run
files = sorted(os.listdir(folder))
wav_files = [f for f in files if f.endswith('.wav')]
yml_files = [f for f in files if f.endswith('.yml')]
# Join with a path separator (plain concatenation glued the folder and file names together)
file = os.path.join(folder, wav_files[idx])

# Make recording playable
# ipd.Audio(file)

In [6]:
# Build the dataset: every mel-spectrogram chunk becomes one entry of Z and
# its class label becomes the matching entry of y.
folder = input_folder

# Accumulated wall-clock time spent loading/resampling and computing features
loading_time = 0
mel_time = 0

y = []  # class label per chunk: index of the sound type within sound_types
Z = []  # mel-spectrogram chunk per entry

for label, sound_type in enumerate(sound_types):
    print('Processing sound type:', sound_type)
    folder2 = os.path.join(folder, sound_type)
    files = os.listdir(folder2)
    # Sort for a reproducible sample order (os.listdir order is arbitrary)
    wav_files = sorted(os.path.join(folder2, f) for f in files if f.endswith('.wav'))
    for file in wav_files:

        # Load audio file at its native rate, then resample to the target rate
        start_time = time.time()
        audio, sr = librosa.load(file, sr=None)
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        loading_time += time.time() - start_time

        # Calculate melspectrogram and chunk it, using the configured
        # spectrogram settings rather than the function defaults
        start_time = time.time()
        mel_chunks = melspectrogram_and_chunk(
            audio,
            chunk_length,
            chunk_step,
            sr=target_sr,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        mel_time += time.time() - start_time

        # Exclude zero-padded last chunk.
        # NOTE(review): when chunk_step < chunk_length more than one trailing
        # chunk is zero-padded by melspectrogram_and_chunk; only the final one
        # is dropped here.
        for mel in mel_chunks[:-1]:
            Z.append(mel)
            y.append(label)  # keep y aligned one-to-one with Z

print(f'Loading time: {loading_time:.2f} seconds')
print(f'Mel spectrogram computation time: {mel_time:.2f} seconds')

Processing sound type: bird
Processing sound type: dog
Processing sound type: rain
Loading time: 21.04 seconds
STFT computation time: 6.27 seconds
Mel spectrogram computation time: 9.08 seconds


In [7]:
# Features: convert the collected chunks to an array and write them out
Z = np.array(Z)
np.save(feature_file, Z)

# Targets: same treatment for the collected labels
y = np.array(y)
np.save(target_file, y)