I've released [this crazily fast (< 10 mins) kernel](https://www.kaggle.com/kneroma/inference-tpu-rfcx-audio-detection-fast), which uses a set of pre-computed MFCCs.

The problem is that those MFCCs are static: if you change any parameter (**DURATION**, **STRIDE**, ...), you can no longer use them. That is not fair, so I am releasing my code, which lets you re-compute them whenever you need.

* I use **joblib** to parallelize the computations, so it should take less than 1 hour to compute the MFCCs for the whole test dataset, and just 30 mins if STRIDE = DURATION
* I read the audio files directly with **soundfile** instead of **librosa**, as soundfile is faster

<h3><font color="blue">Is this kernel useful to you? Don't forget to upvote it; it really motivates me to enhance my work and share it with you :)</font></h3>

In [None]:
import numpy as np
import librosa as lb
import soundfile as sf
import pandas as pd
from pathlib import Path

from tqdm.notebook import tqdm


import time

In [None]:
# Number of target classes; used as the default `num_classes` of RFCXDataset.
NUM_CLASSES = 24
# Sample rate (Hz) the audio is resampled to before spectrograms are computed.
SR = 32_000
# Length of each analysis window, in seconds.
DURATION = 10
# Hop between consecutive window starts, in seconds (half a window here).
STRIDE = DURATION//2

# Recordings with index in [BATCH_START, BATCH_START + BATCH_SIZE) are
# processed by this run.
BATCH_START = 0
BATCH_SIZE = 400

# Number of joblib workers.
NJOBS = 2

# Directory that is globbed for the test *.flac recordings.
TEST_AUDIO_ROOT = Path("../input/rfcx-species-audio-detection/test")
# Output directory for the .npy files; its name encodes the window, stride,
# sample-rate and batch settings so different configurations do not collide.
TEST_MFCC_SAVE_ROOT = Path(f"test_mfcc_d{DURATION}_s{STRIDE}_sr{SR}_{BATCH_START:04d}_{BATCH_START+BATCH_SIZE:04d}")
TEST_MFCC_SAVE_ROOT.mkdir(exist_ok=True)

In [None]:
class MelSpecComputer:
    """Callable that converts a waveform into a dB-scaled mel spectrogram.

    Args:
        sr: Sample rate of the waveforms that will be passed in.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank.
        fmax: Highest frequency of the mel filter bank.
    """

    def __init__(self, sr, n_mels, fmin, fmax):
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax

    def __call__(self, y):
        """Return the mel spectrogram of `y`, converted to dB, as float32."""
        # The waveform is passed by keyword: librosa >= 0.10 made the
        # arguments of `melspectrogram` keyword-only, so a positional `y`
        # raises TypeError there. `y=` is accepted by older releases as well.
        melspec = lb.feature.melspectrogram(
            y=y, sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax,
        )

        return lb.power_to_db(melspec).astype(np.float32)

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Turn a single-channel array into a 3-channel uint8 image.

    The input is replicated three times along a new last axis, standardized,
    and then min-max rescaled to the range [0, 255].

    Args:
        X: Array to convert (e.g. a 2-D spectrogram).
        eps: Added to `std` to avoid division by zero; also the minimum
            value range below which the image is treated as constant.
        mean: Value (scalar or broadcastable array) subtracted before
            scaling. Defaults to the mean of the stacked array.
        std: Value (scalar or broadcastable array) divided by after
            centering. Defaults to the standard deviation of the stacked array.

    Returns:
        uint8 array of shape `X.shape + (3,)`; all zeros when the
        standardized values span a range of at most `eps`.
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. `is None` (not `or`) so that an explicit 0 is honoured and
    # array-valued statistics are not truth-tested, which would raise
    # ValueError.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Clipping to the array's own min/max would be a no-op, so rescale
        # directly.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # Constant image: avoid dividing by a (near-)zero range.
        V = np.zeros_like(X, dtype=np.uint8)

    return V


def normalize(image, mean=None, std=None):
    """Scale a channels-last image to [0, 1] and return it channels-first.

    The image is divided by 255. When both `mean` and `std` are given, the
    result is additionally standardized as `(image - mean) / std`; if either
    one is missing that step is skipped. Axis 2 is then moved to the front
    and the array is cast to float32.
    """
    scaled = image / 255.0

    skip_standardization = mean is None or std is None
    if not skip_standardization:
        scaled = (scaled - mean) / std

    channels_first = np.moveaxis(scaled, 2, 0)
    return channels_first.astype(np.float32)


def crop_or_pad(y, length, sr, is_train=True):
    """Force a 1-D signal to exactly `length` samples.

    Shorter signals are zero-padded at the end. Longer signals are cropped:
    from the beginning when `is_train` is false, otherwise from a start
    position drawn with `np.random.randint`.

    Args:
        y: 1-D numpy array of samples.
        length: Target number of samples.
        sr: Unused; kept so existing callers keep working.
        is_train: Whether to crop at a random position instead of the start.

    Returns:
        float32 array with `length` samples.
    """
    n_samples = len(y)

    if n_samples < length:
        y = np.concatenate([y, np.zeros(length - n_samples)])
    elif n_samples > length:
        if is_train:
            # `randint`'s upper bound is exclusive, so `+ 1` is needed for
            # the last valid start (`n_samples - length`) to be reachable.
            start = np.random.randint(n_samples - length + 1)
        else:
            start = 0

        y = y[start:start + length]

    return y.astype(np.float32, copy=False)

In [None]:
class RFCXDataset:
    """Index-addressable dataset turning each recording into spectrogram images.

    Each item is produced by loading one ``.flac`` recording, resampling it to
    `sr`, cutting it into windows of `DURATION` seconds taken every `STRIDE`
    seconds (both module-level settings, read at call time), and converting
    every window with `MelSpecComputer`, `mono_to_color` and `normalize`.

    Args:
        data: DataFrame with a ``recording_id`` column (file names without
            extension).
        sr: Target sample rate.
        n_mels: Number of mel bands.
        fmin: Lowest mel frequency.
        fmax: Highest mel frequency; defaults to ``sr // 2``.
        num_classes: Stored on the instance; not used by the methods below.
        root: Directory containing the recordings; defaults to
            `TEST_AUDIO_ROOT`.
        clip_duration: Nominal clip length in seconds. Shorter recordings are
            zero-padded to this length and anything beyond it is ignored.
    """

    def __init__(self, data, sr, n_mels=128, fmin=0, fmax=None, num_classes=NUM_CLASSES, root=None,
                 clip_duration=60):

        self.data = data

        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.num_classes = num_classes
        self.clip_duration = clip_duration

        self.root = root or TEST_AUDIO_ROOT

        self.mel_spec_computer = MelSpecComputer(sr=self.sr, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax)

        self.res_type = "kaiser_best"

    def __len__(self):
        """Return the number of recordings in `data`."""
        return len(self.data)

    def _audio_path(self, record):
        """Return the path of the ``.flac`` file for `record` as a string."""
        return self.root.joinpath(record).with_suffix(".flac").as_posix()

    def load(self, record):
        """Load `record` with librosa, resampled to `self.sr`."""
        y, _ = lb.load(self._audio_path(record), sr=self.sr, res_type=self.res_type)
        return y

    def load2(self, record):
        """Load `record` with soundfile, then resample it to `self.sr`."""
        y, orig_sr = sf.read(self._audio_path(record))
        if y.ndim > 1:
            # soundfile returns multi-channel audio as (frames, channels).
            # Average the channels so that resampling and windowing operate
            # on the time axis, matching librosa's default mono loading in
            # `load`.
            y = y.mean(axis=1)
        y = lb.resample(y, orig_sr=orig_sr, target_sr=self.sr, res_type=self.res_type)
        return y

    def read_index(self, idx):
        """Return the windows of recording `idx`, stacked along axis 0.

        Only full-length windows that fit inside `clip_duration` seconds are
        produced, so the stack is never ragged, whatever `DURATION` and
        `STRIDE` are.

        Raises:
            ValueError: If the window is longer than the clip or the stride
                is not positive.
        """
        record = self.data.iloc[idx]["recording_id"]

        y = self.load2(record)

        window = DURATION*self.sr
        stride = STRIDE*self.sr
        total = self.clip_duration*self.sr

        if stride <= 0 or window > total:
            raise ValueError(
                f"Cannot cut windows of {DURATION}s every {STRIDE}s "
                f"from a {self.clip_duration}s clip"
            )

        if len(y) < total:
            # Zero-pad short recordings so every window has the same length.
            y = np.pad(y, (0, total - len(y)))

        # Last admissible start is `total - window`; the `+ 1` makes it
        # inclusive.
        starts = range(0, total - window + 1, stride)
        return np.stack([y[start:start + window] for start in starts])

    def process(self, y):
        """Convert one waveform window into a normalized 3-channel image."""
        melspec = self.mel_spec_computer(y)
        image = mono_to_color(melspec)
        image = normalize(image, mean=None, std=None)
        return image

    def __getitem__(self, idx):
        """Return the stacked images of every window of recording `idx`."""
        windows = self.read_index(idx)
        return np.stack([self.process(window) for window in windows])

    def to_mfcc(self, idx):
        """Compute the images for recording `idx` and save them as ``.npy``.

        The file is written to `TEST_MFCC_SAVE_ROOT` under the recording id.
        Despite the method name, the saved array is the output of
        `__getitem__` (dB-scaled mel-spectrogram images).
        """
        record = self.data.iloc[idx]["recording_id"]
        mfcc = self[idx]

        np.save(TEST_MFCC_SAVE_ROOT.joinpath(record).with_suffix(".npy").as_posix(), mfcc)

In [None]:
# def get_duration(audio_name, root=TEST_AUDIO_ROOT):
#     return lb.get_duration(filename=root.joinpath(audio_name).with_suffix(".flac"))

In [None]:
%%time

# Index of all test recordings (file names without the .flac extension).
# The ids are sorted because directory listing order is not guaranteed, and
# BATCH_START/BATCH_SIZE select recordings by position: without a stable
# order, separate batch runs could overlap or miss recordings.
data = pd.DataFrame({
    "recording_id": sorted(path.stem for path in Path(TEST_AUDIO_ROOT).glob("*.flac")),
})

print(data.shape)
data.head()

In [None]:
# Dataset over every discovered test recording, resampled to SR.
ds = RFCXDataset(data=data, sr=SR)

In [None]:
%%time

# Smoke test: build the window images for one recording and inspect the
# shape of the resulting array.
x = ds[1]
print(x.shape)

In [None]:
# In-memory size of one recording's array, in MiB.
x.nbytes/(1024**2)

In [None]:
# %timeit ds.to_mfcc(0)

In [None]:
import joblib
# Parallel executor with NJOBS workers; it is called later with the list of
# delayed tasks.
pool = joblib.Parallel(n_jobs=NJOBS)

In [None]:
# Fan out one delayed `to_mfcc` call per recording index of the current
# batch (clamped to the dataset size) and run them on the pool, with a
# progress bar over the task list.
mapper = joblib.delayed(ds.to_mfcc)
tasks = [
    mapper(idx)
    for idx in range(BATCH_START, min(BATCH_START + BATCH_SIZE, len(ds)))
]

res = pool(tqdm(tasks))