# Data Preprocessing

- resampling & normalization
- data augmentation (noise addition, pitch shift, time stretching)

In [56]:
from utility_data import *

In [57]:
# Feature-extraction settings handed to AudioDataset.
audio_params = dict(
    sample_rate=32000,
    n_fft=1024,
    hop_length=501,
    n_mfcc=128,
    n_mels=128,
    feature_size=2048,
)

# Dataset built from data/train.csv and data/train_audio, using mel features.
dataset = AudioDataset(
    datafolder="data", metadata_csv="train.csv", audio_dir="train_audio",
    feature_mode='mel', audio_params=audio_params, metadata=True,
)

AudioDataset can export a new dataset (a metadata '.csv' file and a '/train' folder) containing the preprocessed data. The exported data can then be loaded directly onto the GPU, which avoids reprocessing the audio on every run; that reprocessing is a major bottleneck to training.

In [None]:
import math
import time
import soundfile

# Segment geometry: each exported segment holds `wav_sec` seconds of audio,
# i.e. `segment` samples at `sample_rate`.
sample_rate = 32000
wav_sec = 5
segment = wav_sec * sample_rate

# Dataset locations (all derived from `root_path`).
root_path = 'data/'
input_path = f'{root_path}train_audio/'
metadata_path = f'{root_path}train.csv'
output_path = f'./{root_path}train_raw{wav_sec}/'

# Backend name passed to torchaudio.load.
backend = 'soundfile'

# Metadata table; its 'filename' column gives paths relative to `input_path`.
ta_metadata = pd.read_csv(metadata_path)


def crop_and_save(folder:str, transform, filename:str) -> list:
    """
    Load one audio file, split it into fixed-length segments, apply
    `transform` to each segment and save every result as its own '.pt' file.

    The file is read from `folder + filename`. Segments are `segment` samples
    long (module-level setting) and are written below `output_path`, keeping
    the sub-directory of `filename` and naming them '<base>_<i>.pt'.

    Handling of the last segment:
      * if more than half a segment of audio remains, the window is shifted
        back so it covers the final `segment` samples of the file (it then
        overlaps the previous segment);
      * otherwise the short tail is kept in place and zero-padded.
    Files shorter than one segment are always zero-padded.

    Args:
        folder: Directory prefix of the input audio (concatenated as-is with
            `filename`, so it is expected to end with a path separator).
        transform: Callable applied to each (channels, segment) tensor; its
            output is what gets saved.
        filename: Path of the audio file relative to `folder`.

    Returns:
        List of saved segment paths relative to `output_path`. An empty list
        is returned if anything fails (the error is printed); segments written
        before the failure are left on disk.
    """

    filepath = folder + filename

    try:
        # NOTE(review): the sample rate returned by the loader is discarded and
        # no resampling happens here, so segment duration is only correct if
        # the file is already at the expected rate - confirm.
        sig, _ = torchaudio.load(filepath, backend=backend)
        total_len = sig.shape[1]
        num_segments = math.ceil(total_len / segment)

        dir_name = os.path.dirname(filename)
        # splitext also copes with names that have no extension.
        base_name = os.path.splitext(os.path.basename(filename))[0]
        os.makedirs(os.path.join(output_path, dir_name), exist_ok=True)

        filenames = []
        for i in range(num_segments):
            start = i * segment
            end = start + segment

            # If this is the last segment
            if end >= total_len:
                remaining = total_len - start

                # If more than half a segment remains, shift the window back
                # to capture the last `segment` samples instead of padding.
                # NOTE(review): the previous comment said "less than half";
                # the condition is kept as coded - confirm which is intended.
                if remaining > segment // 2:
                    start = max(0, total_len - segment)
                    end = total_len

            segment_data = sig[:, start:end]

            # Zero-pad short segments (short tails and files shorter than one
            # segment) on the right. F.pad keeps the channel count, dtype and
            # device of the signal, so multi-channel audio works too.
            if segment_data.shape[1] < segment:
                pad_amount = segment - segment_data.shape[1]
                segment_data = torch.nn.functional.pad(segment_data, (0, pad_amount))

            segment_filename = os.path.join(dir_name, f'{base_name}_{i}.pt')
            segment_path = os.path.join(output_path, segment_filename)

            spectrogram = transform(segment_data)

            torch.save(spectrogram, segment_path)

            filenames.append(segment_filename)

        return filenames

    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller
        # continue with the remaining files.
        print(f'Error processing {filename}: {e}')
        return []

In [85]:
import torchaudio
# Mel-spectrogram transform applied to every exported segment.
# NOTE(review): only `sample_rate` is passed, so the remaining settings are
# torchaudio's defaults rather than the values in `audio_params` - confirm
# this is intended.
mel_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)

# Work on a copy of the first few metadata rows.
examples = ta_metadata.head().copy()

# Remember which original row each exported segment came from.
examples['idx'] = examples.index

# Replace each source filename with the list of segment files it produced.
examples['filename'] = examples['filename'].map(
    lambda name: crop_and_save(input_path, mel_transform, name)
)

# Explode the 'filename' column to create a new row for each segment file.
examples = examples.explode('filename', ignore_index=True)

# A file that failed to process yields an empty list, which explode turns
# into a NaN filename; drop those rows so every row points at a real file.
examples = examples.dropna(subset=['filename']).reset_index(drop=True)



Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license,idx
0,1139490,[''],[''],1139490/CSA36385_0.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,0
1,1139490,[''],[''],1139490/CSA36385_1.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,0
2,1139490,[''],[''],1139490/CSA36385_2.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,0
3,1139490,[''],[''],1139490/CSA36385_3.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,0
4,1139490,[''],[''],1139490/CSA36385_4.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,1192948,[''],[''],1192948/CSA36373_16.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,4
103,1192948,[''],[''],1192948/CSA36373_17.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,4
104,1192948,[''],[''],1192948/CSA36373_18.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,4
105,1192948,[''],[''],1192948/CSA36373_19.pt,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0,4
