# Notes

I've released a [training kernel](https://www.kaggle.com/kneroma/clean-fast-simple-bird-identifier-training-colab) and an [inference kernel](https://www.kaggle.com/kneroma/clean-fast-simple-bird-identifier-inference). Both of these kernels use a set of pre-computed mels, which can be found at:

* https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part1
* https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part2
* https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part3
* https://www.kaggle.com/kneroma/kkiller-birdclef-mels-computer-d7-part4

Unfortunately, these mels are static (7s audio extracts), so they cannot be customized. Here, I'm releasing the base kernel that generates them so that everyone can experiment with the parameters.

In [None]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

from matplotlib import pyplot as plt

from tqdm.notebook import tqdm
import joblib, json

from  sklearn.model_selection  import StratifiedKFold

In [None]:
# Load a CSV into `data` and display it as the cell output.
# NOTE(review): `data` is not referenced by any other cell visible here — confirm it is still needed.
data = pd.read_csv("../input/aicovid-final/new_full_data/data_2.csv")
data

In [None]:
# Position in PART_INDEXES of the slice start: make_df keeps the metadata rows
# PART_INDEXES[PART_ID]:PART_INDEXES[PART_ID+1]. Change it to compute mels on another subset.
PART_ID = 0
# Row boundaries used to split the train set into subsets.
# NOTE(review): the boundaries are not increasing (8000 > 4000), so PART_ID = 1 selects an
# empty slice, and four boundaries delimit three subsets rather than four — confirm the intended values.
PART_INDEXES = [0,8000, 4000, 6619]

In [None]:
SR = 22050  # Sample rate (Hz) used as AudioToImage's default `sr`
DURATION = 7  # Length, in seconds, of each audio extract
SEED = 666  # Default random state for the stratified fold split in make_df
DATA_ROOT = Path("../input/aicovid-1/aicv115m_final_public_train")  # Directory holding public_train_metadata.csv
TRAIN_AUDIO_IMAGES_SAVE_ROOT = Path("audio_images") # Where the mel images are saved
# Create the output directory up front; no error if it already exists.
TRAIN_AUDIO_IMAGES_SAVE_ROOT.mkdir(exist_ok=True, parents=True)

In [None]:
def get_audio_info(filepath):
    """Read basic properties of an audio file from its header.

    Returns a dict with the frame count (``frames``), the sample rate
    (``sr``) and the length in seconds (``duration``).
    """
    with SoundFile(filepath) as audio_file:
        n_frames = audio_file.frames
        sample_rate = audio_file.samplerate
    return {
        "frames": n_frames,
        "sr": sample_rate,
        "duration": float(n_frames) / sample_rate,
    }

In [None]:
def make_df(n_splits=5, seed=SEED, nrows=None):
    """Build the metadata table for the current partition of the train set.

    Reads ``public_train_metadata.csv`` from ``DATA_ROOT``, keeps the rows
    ``PART_INDEXES[PART_ID]:PART_INDEXES[PART_ID + 1]``, adds the audio file
    path plus the properties returned by ``get_audio_info`` for every row,
    and assigns each row to a stratified cross-validation fold.

    Args:
        n_splits: Number of stratified folds.
        seed: Random state used to shuffle before splitting.
        nrows: Optional cap on the number of CSV rows read (``None`` reads all).

    Returns:
        DataFrame with the CSV columns plus ``label_id``, ``filepath``,
        ``frames``, ``sr``, ``duration`` and ``fold``. The row index of the
        CSV is preserved.

    Raises:
        ValueError: If the selected partition contains no rows.
    """
    df = pd.read_csv(DATA_ROOT/"public_train_metadata.csv", nrows=nrows)
    # The label is a straight copy of the assessment result (no remapping).
    df["label_id"] = df["assessment_result"]

    start, stop = PART_INDEXES[PART_ID], PART_INDEXES[PART_ID+1]
    # Copy the slice so the column assignments below write to an independent
    # frame instead of a view of the full table.
    df = df.iloc[start:stop].copy()
    if df.empty:
        raise ValueError(
            f"Partition {PART_ID} (rows {start}:{stop}) selects no rows; check PART_INDEXES."
        )

    audio_dir = (DATA_ROOT / "public_train_audio_files").as_posix()
    df["filepath"] = audio_dir + "/" + df["uuid"] + ".wav"

    pool = joblib.Parallel(4)
    mapper = joblib.delayed(get_audio_info)
    tasks = [mapper(filepath) for filepath in df["filepath"]]

    # Give the audio properties the same index as the sliced metadata;
    # otherwise concat aligns on labels and partitions that do not start at
    # row 0 end up with misaligned, NaN-filled rows.
    audio_info = pd.DataFrame(pool(tqdm(tasks)), index=df.index)
    df = pd.concat([df, audio_info], axis=1, sort=False)

    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    splits = skf.split(np.arange(len(df)), y=df["label_id"].values)
    df["fold"] = -1

    # Each row appears in exactly one validation set; that set's number is its fold.
    for fold, (_, val_set) in enumerate(splits):
        df.loc[df.index[val_set], "fold"] = fold

    return df

In [None]:
 df = make_df(nrows=None)



In [None]:
# Persist the enriched metadata (the row index is written too), then show its shape and first rows.
df.to_csv("rich_train_metadata.csv", index=True)
print(df.shape)
df.head()

In [None]:
# Number of rows assigned to each cross-validation fold.
df["fold"].value_counts()

In [None]:
# Number of rows per label value.
df["label_id"].value_counts()

In [None]:
# Histogram (20 bins) of the recording durations in seconds.
df["duration"].hist(bins=20)

In [None]:
# Duration quantile curve for q = 0.00 up to (but excluding) 1.00, in steps of 0.01.
df["duration"].quantile(np.arange(0, 1, 0.01)).plot()

In [None]:
class MelSpecComputer:
    """Callable that converts a waveform into a dB-scaled mel spectrogram.

    Args:
        sr: Sample rate of the audio that will be passed in.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank.
        **kwargs: Extra keyword arguments forwarded to
            ``librosa.feature.melspectrogram``. ``n_fft`` defaults to
            ``sr // 10`` and ``hop_length`` to ``sr // 40`` when not given.
    """

    def __init__(self, sr, n_mels, fmin, fmax, **kwargs):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax
        # Window and hop sizes scale with the sample rate unless the caller overrides them.
        kwargs.setdefault("n_fft", self.sr // 10)
        kwargs.setdefault("hop_length", self.sr // (10 * 4))
        self.kwargs = kwargs

    def __call__(self, y):
        """Return the mel spectrogram of ``y`` in decibels as a float32 array."""
        # `y` is passed by keyword: recent librosa releases no longer accept
        # it positionally, while older releases accept the keyword form too.
        melspec = lb.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, **self.kwargs,
        )

        return lb.power_to_db(melspec).astype(np.float32)

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize an array and rescale it to the 0-255 range as uint8.

    Args:
        X: Input array (e.g. a mel spectrogram).
        eps: Small constant added to ``std`` to avoid division by zero; also
            the minimum value range below which the output is all zeros.
        mean: Value(s) subtracted from ``X``; defaults to ``X.mean()``.
        std: Value(s) ``X`` is divided by; defaults to ``X.std()``.

    Returns:
        uint8 array with the same shape as the standardized input, min-max
        scaled to 0-255, or all zeros when the input is (nearly) constant.
    """
    # Test against None explicitly: `mean or ...` would discard an explicit 0
    # and raise on array-valued arguments.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    # A (nearly) constant input has no contrast to stretch.
    if (_max - _min) <= eps:
        return np.zeros_like(X, dtype=np.uint8)

    V = 255 * (X - _min) / (_max - _min)
    return V.astype(np.uint8)

def crop_or_pad(y, length, is_train=True, start=None):
    """Return ``y`` cropped or zero-padded to exactly ``length`` samples.

    Args:
        y: 1-D sequence of samples.
        length: Target number of samples.
        is_train: When cropping without an explicit ``start``, pick a random
            window if True, otherwise take the window starting at 0.
        start: Index of the first kept sample when cropping. ``None`` means
            "choose automatically"; an explicit 0 is honoured.

    Returns:
        ``y`` itself when it already has ``length`` samples, a slice of ``y``
        when it is longer, or ``y`` followed by zeros when it is shorter.
    """
    n_samples = len(y)

    if n_samples < length:
        # Zero-pad at the end up to the target length.
        return np.concatenate([y, np.zeros(length - n_samples)])

    if n_samples > length:
        if start is None:
            # +1 so that the last valid window (ending on the final sample)
            # can be drawn as well.
            start = np.random.randint(n_samples - length + 1) if is_train else 0
        y = y[start:start + length]

    return y

In [None]:
class AudioToImage:
    """Convert one audio file into a stack of fixed-duration mel images.

    Each file is read at its native sample rate and cut into windows of
    ``duration`` seconds whose starts are 0.66 of a window apart; every
    window is turned into a uint8 mel-spectrogram image and the images are
    stacked along a new first axis.

    NOTE: ``step``, ``res_type``, ``resample``, ``fmax`` and
    ``self.mel_spec_computer`` are stored but not used by ``__call__``:
    no resampling is performed, the window step is fixed at 0.66 of the
    window length, and the mel filter bank always spans up to half of the
    file's own sample rate.

    Args:
        sr: Nominal sample rate.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency; defaults to ``sr // 2``.
        duration: Window length in seconds.
        step: Window step in samples; defaults to one full window.
        res_type: Resampling method name (currently unused).
        resample: Whether to resample to ``sr`` (currently unused).
    """

    def __init__(self, sr=SR, n_mels=128, fmin=0, fmax=None, duration=DURATION, step=None, res_type="kaiser_fast", resample=True):

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.duration = duration
        self.audio_length = self.duration*self.sr
        self.step = step or self.audio_length

        self.res_type = res_type
        self.resample = resample

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin,
                                                 fmax=self.fmax)

    def audio_to_image(self, audio, orig_sr):
        """Return the uint8 mel image of one window sampled at ``orig_sr``."""
        # Built per call because the filter bank depends on the file's own sample rate.
        mel_spec_computer = MelSpecComputer(sr=orig_sr, n_mels=self.n_mels, fmin=self.fmin,
                                            fmax=orig_sr//2)
        melspec = mel_spec_computer(audio)
        return mono_to_color(melspec)

    def __call__(self, row, save=True):
        """Compute the mel images of the file described by ``row``.

        Args:
            row: Record exposing ``filepath`` and ``uuid`` attributes.
            save: If True, write the stack to
                ``TRAIN_AUDIO_IMAGES_SAVE_ROOT/<uuid>/<uuid>.npy`` and return
                None; otherwise return ``(identifier, images)``.

        Returns:
            None when ``save`` is True, else a tuple of the row's
            ``filename`` (falling back to ``uuid`` when the row has no such
            attribute) and the stacked images.
        """
        audio, orig_sr = sf.read(row.filepath, dtype="float32")
        # Multi-channel files come back as (frames, channels); down-mix to
        # mono so that windowing and padding operate on a 1-D signal.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Slice bounds must be integers even if `duration` is fractional.
        audio_length = int(self.duration * orig_sr)
        step = int(0.66 * audio_length)

        # At least one window is always produced; a file shorter than one
        # window yields a single short chunk that is padded below.
        n_starts = max(1, len(audio) - audio_length + 1)
        audios = [audio[i:i + audio_length] for i in range(0, n_starts, step)]
        audios[-1] = crop_or_pad(audios[-1], length=audio_length)

        images = np.stack([self.audio_to_image(chunk, orig_sr) for chunk in audios])

        if save:
            path = TRAIN_AUDIO_IMAGES_SAVE_ROOT/f"{row.uuid}/{row.uuid}.npy"
            path.parent.mkdir(exist_ok=True, parents=True)
            np.save(str(path), images)
            return None

        # Rows built in this notebook are keyed by `uuid`; `filename` is used
        # only when the row actually provides it.
        return getattr(row, "filename", row.uuid), images

In [None]:
def get_audios_as_images(df):
    """Run AudioToImage on every row of ``df`` using two parallel workers.

    Each row is handed to the converter with its default ``save=True``;
    nothing is returned.
    """
    to_image = joblib.delayed(AudioToImage(step=int(DURATION*0.666*SR)))
    jobs = [to_image(record) for record in df.itertuples(index=False)]

    workers = joblib.Parallel(2)
    workers(tqdm(jobs))

In [None]:
# Compute and save the mel images for every row of the current partition.
get_audios_as_images(df)

In [None]:
# !rm -rf /kaggle/working/audio_images

In [None]:
# Display the metadata table.
df

In [None]:
# Sanity check: reload the saved stack of one file and print its shape.
# Select by position rather than by label so the check also works for
# partitions whose row index does not contain the label 2.
row = df.iloc[2]
mels = np.load(str(TRAIN_AUDIO_IMAGES_SAVE_ROOT / row.uuid / f"{row.uuid}.npy"))
print(mels.shape)

In [None]:
# Render the third window of the loaded stack.
# NOTE(review): raises IndexError when the file produced fewer than three windows.
lbd.specshow(mels[2])

In [None]:
# Show the first window of the loaded stack as a plain image.
plt.imshow(mels[0])

In [None]:
# Number of files per native sample rate.
df["sr"].value_counts()