This notebook is an adaptation of a wonderful notebook by kkiller from last year's BirdCLEF competition:
https://www.kaggle.com/kneroma/birdclef-mels-computer-public/

**Please upvote it as well!**


I just fixed a few things and reformatted it a little.

This notebook lets you precompute melspectrograms and save them to disk, which speeds up the training of your models.

## Import packages

In [None]:
import numpy as np
import pandas as pd
import json
import soundfile as sf
import librosa as lb
import librosa.display as lbd
from pathlib import Path
from soundfile import SoundFile
from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm

## Config

In [None]:
# Folder that holds train_metadata.csv and the train_audio directory.
DATA_DIR = Path("../input/birdclef-2022")
SR = 32_000   # sample rate
DURATION = 7  # length, in seconds, of each audio chunk that becomes one melspectrogram
# Random seed used for the stratified fold split.
SEED = 42

TRAIN_AUDIO_DIR = DATA_DIR/"train_audio"
TRAIN_AUDIO_IMAGES_SAVE_DIR = Path("audio_images") # save the melspectrograms to here
# Create the output folder up front; no error is raised if it already exists.
TRAIN_AUDIO_IMAGES_SAVE_DIR.mkdir(exist_ok=True, parents=True)

## Prepare training dataset

In [None]:
def get_audio_info(filepath):
    """Return basic properties of the audio file at ``filepath``.

    The result is a dict with the number of frames ("frames"), the sample
    rate ("sr") and the length in seconds ("duration").
    """
    with SoundFile(filepath) as audio_file:
        n_frames, sample_rate = audio_file.frames, audio_file.samplerate
    return {
        "frames": n_frames,
        "sr": sample_rate,
        "duration": float(n_frames) / sample_rate,
    }

In [None]:
def make_df(n_splits=5, seed=SEED, nrows=None):
    """Build the training dataframe with label ids, audio properties and folds.

    Reads ``train_metadata.csv`` (optionally only the first ``nrows`` rows),
    maps each ``primary_label`` to an integer id, probes every audio file for
    its frame count / sample rate / duration, and assigns each row to one of
    ``n_splits`` stratified validation folds.

    Returns a tuple ``(LABEL_IDS, df)`` where ``LABEL_IDS`` maps label -> id.
    """
    meta = pd.read_csv(DATA_DIR/"train_metadata.csv", nrows=nrows)

    # Ids follow the alphabetical order of the labels.
    species = sorted(meta["primary_label"].unique())
    LABEL_IDS = dict(zip(species, range(len(species))))
    meta["label_id"] = meta["primary_label"].map(LABEL_IDS)
    meta["filepath"] = [str(TRAIN_AUDIO_DIR/name) for name in meta.filename]

    # Probe all audio files in parallel and append the results as new columns.
    jobs = [delayed(get_audio_info)(path) for path in meta.filepath]
    infos = Parallel(-1)(tqdm(jobs))
    meta = pd.concat([meta, pd.DataFrame(infos)], axis=1, sort=False)

    # Every row ends up in the validation part of exactly one fold.
    meta["fold"] = -1
    splitter = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    fold_indices = splitter.split(np.arange(len(meta)), y=meta.label_id.values)
    for fold, (_, val_idx) in enumerate(fold_indices):
        meta.loc[meta.index[val_idx], "fold"] = fold

    return LABEL_IDS, meta

Process the training dataframe and split it into folds.

In [None]:
# Build the enriched training dataframe and the label -> id mapping from all metadata rows.
LABEL_IDS, df = make_df(nrows=None)

# Write both results to the current working directory.
df.to_csv("rich_train_metadata.csv", index=True)
with open("LABEL_IDS.json", "w") as f:
    json.dump(LABEL_IDS, f)

# Quick look at the result.
print(df.shape)
df.head()

Let's see how many elements each fold has.

In [None]:
# Number of rows assigned to each fold.
df["fold"].value_counts()

## Helper functions

In [None]:
class MelSpecComputer:
    """Callable that turns a waveform into a dB-scaled mel spectrogram.

    ``n_fft`` defaults to ``sr // 10`` and ``hop_length`` to ``sr // 40``
    unless they are given as keyword arguments. All keyword arguments are
    forwarded to ``librosa.feature.melspectrogram``.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr, self.n_mels = sr, n_mels
        self.fmin, self.fmax = fmin, fmax
        # Only fill in the STFT parameters the caller did not specify.
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs

    def __call__(self, y):
        """Return the float32 mel spectrogram of ``y``, converted from power to dB."""
        power = lb.feature.melspectrogram(
            y=y,
            sr=self.sr,
            n_mels=self.n_mels,
            fmin=self.fmin,
            fmax=self.fmax,
            **self.kwargs,
        )
        return lb.power_to_db(power).astype(np.float32)

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize an array and rescale it to the uint8 range 0..255.

    Parameters
    ----------
    X : numpy array holding the single-channel image (e.g. a melspectrogram).
    eps : small constant added to ``std`` to avoid division by zero; also the
        minimum value range required for rescaling.
    mean, std : statistics used for standardization. When ``None`` they are
        computed from ``X`` itself. Scalars or arrays that broadcast against
        ``X`` are accepted.

    Returns
    -------
    A uint8 array. If the standardized values span at most ``eps``, an
    all-zero array is returned instead of a rescaled one.
    """
    # Compare against None explicitly: a truthiness test would discard an
    # explicit 0 and raises ValueError for array-valued statistics.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; the cast to uint8 truncates the fraction.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Almost) constant input: nothing meaningful to scale.
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length, is_train=True, start=None):
    """Return ``y`` cropped or zero-padded to exactly ``length`` samples.

    Parameters
    ----------
    y : one-dimensional audio signal.
    length : required number of samples.
    is_train : when cropping without an explicit ``start``, pick a random
        offset if True, otherwise crop from the beginning.
    start : explicit crop offset; an explicit ``0`` is honoured.

    Returns
    -------
    ``y`` itself if it already has ``length`` samples; a slice of ``y`` if it
    is longer; otherwise ``y`` followed by zeros, in the dtype of ``y``.
    """
    n_samples = len(y)

    if n_samples < length:
        y = np.asarray(y)
        # Pad in the signal's own dtype so float32 audio is not silently
        # upcast to float64.
        padding = np.zeros(length - n_samples, dtype=y.dtype)
        return np.concatenate([y, padding])

    if n_samples > length:
        if start is None:
            # randint's upper bound is exclusive, so offsets are drawn from
            # 0 .. n_samples - length - 1.
            start = np.random.randint(n_samples - length) if is_train else 0
        return y[start:start + length]

    return y

In [None]:
class AudioToImage:
    """Convert one audio recording into a stack of melspectrogram images.

    Calling an instance with a metadata row reads (part of) the audio file,
    converts it to mono at the target sample rate, cuts it into chunks of
    ``duration`` seconds taken every ``step`` samples, and turns each chunk
    into a uint8 melspectrogram image.

    Parameters
    ----------
    sr : target sample rate.
    n_mels, fmin, fmax : mel filter bank settings; ``fmax`` defaults to ``sr // 2``.
    duration : chunk length in seconds.
    step : hop between chunk starts in samples; defaults to the chunk length
        (no overlap).
    res_type : resampling method passed to ``librosa.resample``.
    resample : whether to resample audio whose sample rate differs from ``sr``.
    """

    def __init__(self, sr=SR, n_mels=128, fmin=0, fmax=None, 
                 duration=DURATION, step=None, res_type="kaiser_fast", 
                 resample=True):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.duration = duration
        self.audio_length = self.duration*self.sr
        self.step = step or self.audio_length
        
        self.res_type = res_type
        self.resample = resample

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin,
                                                 fmax=self.fmax)
        
    def audio_to_image(self, audio):
        """Return the uint8 melspectrogram image of one audio chunk."""
        melspec = self.mel_spec_computer(audio) 
        return mono_to_color(melspec)

    def _load_audio(self, row):
        """Read at most 10 * duration seconds of ``row``'s file as mono audio at the target rate."""
        max_audio_duration = 10*self.duration
        init_audio_length = max_audio_duration*row.sr

        # Longer recordings are read from a random offset. The guard also
        # covers a recording exactly as long as the window, for which
        # np.random.randint(0) would raise ValueError.
        max_start = row.frames - init_audio_length
        start = np.random.randint(max_start) if max_start > 0 else 0

        audio, orig_sr = sf.read(row.filepath, start=start, stop=start+init_audio_length, dtype="float32")

        # soundfile returns multi-channel audio as (frames, channels) while
        # librosa.to_mono expects (channels, frames), hence the transpose.
        if audio.ndim > 1:
            audio = lb.to_mono(audio.T)

        if self.resample and orig_sr != self.sr:
            # Sample rates are passed by keyword: they are keyword-only in
            # librosa >= 0.10 and accepted by keyword in earlier versions.
            audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr, res_type=self.res_type)

        return audio

    def __call__(self, row, save=True):
        """Convert the recording described by ``row`` to melspectrogram images.

        ``row`` must provide the attributes ``filepath``, ``filename``, ``sr``
        and ``frames``. With ``save=True`` the stacked images are written to
        ``TRAIN_AUDIO_IMAGES_SAVE_DIR/<filename>.npy`` and ``None`` is
        returned; otherwise ``(row.filename, images)`` is returned.
        """
        audio = self._load_audio(row)

        # Always produce at least one chunk, even for audio shorter than a chunk.
        n_starts = max(1, len(audio) - self.audio_length + 1)
        audios = [audio[i:i+self.audio_length] for i in range(0, n_starts, self.step)]
        # Only the last chunk can be shorter than audio_length.
        audios[-1] = crop_or_pad(audios[-1], length=self.audio_length)
        images = np.stack([self.audio_to_image(chunk) for chunk in audios])
        
        if save:
            path = TRAIN_AUDIO_IMAGES_SAVE_DIR/f"{row.filename}.npy"
            # The filename may contain a sub-folder, so create it first.
            path.parent.mkdir(exist_ok=True, parents=True)
            np.save(str(path), images)
        else:
            return row.filename, images

## Convert audio to melspectrograms and save them as images

In [None]:
def get_audios_as_images(df):
    """Convert every recording listed in ``df`` to melspectrogram images in parallel.

    Chunk starts are spaced 0.666 * DURATION seconds apart, so consecutive
    chunks overlap. The converter runs with its default ``save=True``; this
    function returns nothing.
    """
    to_image = AudioToImage(step=int(DURATION*0.666*SR))
    jobs = [delayed(to_image)(record) for record in df.itertuples(index=False)]
    Parallel(n_jobs=-1)(tqdm(jobs))

In [None]:
# Convert all recordings in the dataframe and save the images as .npy files.
get_audios_as_images(df)

In [None]:
# Reload the saved images of the longest recording and check their shape.
row = df.loc[df.duration.idxmax()]
mels = np.load(str((TRAIN_AUDIO_IMAGES_SAVE_DIR/row.filename).as_posix() + ".npy"))
print(mels.shape)

## Show one melspectrogram

Let's show one melspectrogram as an example

In [None]:
# Plot the image of the first chunk.
lbd.specshow(mels[0])