## This code processes audio data into tensors

In [None]:
print()
import os
import time
# common math imports
import numpy as np
# common audio imports
import librosa
# common torch imports
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Run configuration. All three are placeholders and must be set to real
# paths before the following cells are executed.
# Directory that is scanned for one sub-folder per bird species.
input_folder = None
# Directory that receives birds.csv, files.csv, zc_rates.csv and the tensors.
output_folder = None
# JSON file whose 'tensorize' section holds the processing parameters.
params_file = None

In [None]:
print()
# Make sure the output folder is available before anything is written to it.
# isdir (rather than exists) so that a regular file with the same name is not
# mistaken for a usable folder.
if os.path.isdir(output_folder):
    print(f"The output folder '{output_folder}' exists.")
else:
    print(f"The output folder '{output_folder}' does not exist.")
    # exist_ok=True: do not fail if the folder appears between the check
    # above and this call (e.g. a parallel run creating the same folder).
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the output folder '{output_folder}'.")

In [None]:
print()
# process data in json file
import json

with open(params_file) as f:
    params = json.load(f)

# Every setting used by this notebook lives under the 'tensorize' key.
tensorize_params = params['tensorize']
alphabet = str(tensorize_params['alphabet'])
shift_std = float(tensorize_params['shift_std'])
frame_duration = float(tensorize_params['frame_duration'])
hop_scalar = float(tensorize_params['hop_scalar'])
rate_present = float(tensorize_params['rate_present'])
num_seconds = float(tensorize_params['num_seconds'])
max_kbps = float(tensorize_params['max_kbps'])
n_mels = int(tensorize_params['n_mels'])
sr_all = int(tensorize_params['sr_all'])

# manipulate the data for intermediate variables
# Frame length in samples.  Only files recorded at sr_all are processed, so
# the conversion from seconds to samples must use sr_all rather than a fixed
# 32 kHz (identical result when sr_all == 32000).
frame_length = int(frame_duration * sr_all)
hop_length = int(frame_length * hop_scalar)
if hop_length < 1:
    raise ValueError(
        f"hop_length must be at least 1 sample, got {hop_length} "
        f"(frame_duration={frame_duration}, hop_scalar={hop_scalar}, sr_all={sr_all})"
    )
max_size_bytes = int(max_kbps * 1024)
# Number of spectrogram frames that make up one num_seconds-long slice.
frames_per_s = int((sr_all * num_seconds) // hop_length)

# ignore this aspect of code for rnns
# could back into that format later
sequence_size = 1

# An empty alphabet means "no filtering": accept every lowercase first letter.
if alphabet == "":
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

In [None]:
print()
# Every immediate sub-directory of input_folder is treated as one bird species.
bird_folders = list(filter(
    os.path.isdir,
    (os.path.join(input_folder, entry) for entry in os.listdir(input_folder)),
))

# Class names are the folder basenames; np.unique de-duplicates and sorts them.
bird_names = np.unique(list(map(os.path.basename, bird_folders))).tolist()
# Index 0 is reserved for the 'no_bird' class, species follow in sorted order.
bird_names = ['no_bird', *bird_names]
birdname_to_label = dict(zip(bird_names, range(len(bird_names))))

['no_bird',
 'apapan',
 'arcter',
 'bcnher',
 'belkin1',
 'dunlin',
 'elepai',
 'ercfra',
 'eurwig',
 'fragul']

In [None]:
start_time = time.time()
curr_time = time.time()
print()
print('Tensorizing all birds')
print('----------')
print()

# Accumulators filled while walking the species folders:
#   y - label arrays, one per emitted sequence
#   X - Mel-spectrogram slices, one per emitted sequence
#   z - source file path of each emitted sequence
#   W - zero-crossing rates of every loaded file (kept for debugging)
y = []
X = []
z = []
W = []

# Number of zero-crossing-rate frames in one labelled window.  The rates are
# computed with the same hop_length as the Mel spectrogram, so a window has to
# span frames_per_s frames for the labels to line up one-to-one with the Mel
# slices (num_seconds / frame_duration only gives that when hop_scalar == 1).
window_size = frames_per_s

for bird_folder in bird_folders:
    bird_name = os.path.basename(bird_folder)

    # only process species whose name starts with one of the selected letters
    if bird_name[0] not in alphabet:
        continue

    print(bird_name.strip())

    bird_label = birdname_to_label[bird_name]
    no_bird_label = birdname_to_label['no_bird']
    files = [f for f in os.listdir(bird_folder) if f.endswith('.ogg')]

    for file in files:

        file_path = os.path.join(bird_folder, file)
        # do not load if the file is too large
        if os.path.getsize(file_path) > max_size_bytes:
            continue

        # load the file at its native sample rate
        audio, sr = librosa.load(file_path, sr=None)

        # files are not resampled: anything not already at sr_all is dropped
        if sr != sr_all:
            continue

        # check if the audio has no bird: a frame counts as "bird present"
        # when the zero-crossing rate of the shifted signal exceeds rate_present
        shift_amplitude = np.std(audio) * shift_std
        zcr = librosa.feature.zero_crossing_rate(audio + shift_amplitude,
                                                 frame_length=frame_length,
                                                 hop_length=hop_length,
                                                 ).squeeze()
        W.append(zcr.tolist())
        zcr_present = zcr > rate_present
        try:
            # a single-frame result is 0-d after squeeze(), so len() raises here
            pad_length = (-len(zcr_present)) % window_size
            if pad_length > 0:
                zcr_present = np.pad(zcr_present, (0, pad_length), constant_values=False)
            # one value per window: true if any frame in the window is "present"
            zcr_windowed = np.add.reduceat(zcr_present,
                                           np.arange(0, len(zcr_present), window_size))
        except Exception:
            # if the audio is too short, skip this file only and carry on
            # with the remaining files of the species
            print(f"Skipping {file_path} due to length issues.")
            continue

        # convert to categorical labels
        labels_categorical = np.where(zcr_windowed > 0, bird_label, no_bird_label)

        # compute the Mel spectrogram for the audio
        S = librosa.feature.melspectrogram(y=audio,
                                           sr=sr,
                                           n_mels=n_mels,
                                           fmax=None,
                                           hop_length=hop_length)

        # pad S with zeros if needed so its second dimension is a multiple of frames_per_s
        pad_width = (-S.shape[1]) % frames_per_s
        if pad_width > 0:
            S = np.pad(S, ((0, 0), (0, pad_width)), mode='constant')

        # slice the Mel spectrogram into units of frames_per_s frames
        num_slices = S.shape[1] // frames_per_s
        mel_slices = np.array(
            [S[:, i * frames_per_s:(i + 1) * frames_per_s] for i in range(num_slices)],
            dtype=np.float32,
        )

        # Keep labels and slices the same length so that every emitted
        # sequence has a consistent shape (a no-op when the counts agree).
        n_units = min(len(labels_categorical), len(mel_slices))
        labels_categorical = labels_categorical[:n_units]
        mel_slices = mel_slices[:n_units]

        # check and manipulate sizes of the audio file
        curr_len = len(labels_categorical)
        addi_len = sequence_size - curr_len

        if addi_len >= 0:
            # The audio file is shorter than (or exactly fits) the sequence size:
            # pad the labels and mel_slices up to the sequence size if needed
            if addi_len > 0:
                labels_categorical = np.pad(labels_categorical, (0, addi_len), constant_values=0)
                mel_slices = np.pad(mel_slices, ((0, addi_len), (0, 0), (0, 0)), mode='constant')
            label_chunks = [labels_categorical]
            mel_chunks = [mel_slices]
        else:
            # The audio file is longer than the sequence size:
            # split the labels and mel_slices into chunks of sequence size
            split_points = np.arange(sequence_size, curr_len, sequence_size)
            label_chunks = np.array_split(labels_categorical, split_points)
            mel_chunks = np.array_split(mel_slices, split_points)

            # Pad the last split if it's shorter than sequence_size
            pad_len = sequence_size - len(label_chunks[-1])
            if pad_len > 0:
                label_chunks[-1] = np.pad(label_chunks[-1], (0, pad_len), constant_values=0)
                mel_chunks[-1] = np.pad(mel_chunks[-1], ((0, pad_len), (0, 0), (0, 0)), mode='constant')

        for lbls, mels in zip(label_chunks, mel_chunks):
            y.append(lbls)
            X.append(mels)
            z.append(file_path)

    next_time = time.time()
    print(f'Bird processed in {next_time - curr_time} seconds.')
    print('----------')
    print()
    curr_time = next_time

end_time = time.time()
print(f'All birds processed in {end_time - start_time} seconds.')
print('----------')

# Save data as tensors
y = np.array(y, dtype=np.int64)
X = np.array(X, dtype=np.float32)
# uint8 keeps the label file small but can only represent labels 0..255;
# fall back to int64 when there are more classes than that.
label_dtype = torch.uint8 if len(bird_names) <= 256 else torch.int64
y_tensor = torch.tensor(y, dtype=label_dtype)
X_tensor = torch.tensor(X, dtype=torch.float16)



apapan
arcter
bcnher
belkin1
dunlin
elepai
ercfra
eurwig
fragul


In [None]:
print()
# saving the data
# bird names for categories: one "label,name" row per class
with open(os.path.join(output_folder, "birds.csv"), 'w') as f:
    for label, name in enumerate(bird_names):
        f.write(f"{label},{name}\n")
# where the files came from: one source path per emitted sequence
with open(os.path.join(output_folder, "files.csv"), "w") as f:
    f.writelines(f"{path}\n" for path in z)
# the zero crossing rates
# which will not be aligned with the mel spectrograms
# but will be useful for debugging
with open(os.path.join(output_folder, "zc_rates.csv"), "w") as f:
    for rates in W:
        # A file with a single frame was stored as a bare number rather than
        # a list; write it as a one-value row.
        if not isinstance(rates, list):
            rates = [rates]
        f.write("".join(f"{round(rate, 4)}," for rate in rates) + "\n")

# X_tensor holds one entry per sequence, each a stack of Mel slices.
# Merge the first two dimensions so every slice becomes its own sample, then
# add a singleton dimension at index 1 for the CNN input.
X_cnn = torch.flatten(X_tensor, end_dim=1).unsqueeze(1)
y_cnn = torch.flatten(y_tensor, end_dim=1)
torch.save(X_cnn, os.path.join(output_folder, 'X_tensor.pt'))
torch.save(y_cnn, os.path.join(output_folder, 'y_tensor.pt'))