In [None]:
import os
import shutil
import  joblib
import numpy as np
import pandas as pd
import librosa as lb
import librosa.display
import matplotlib.pyplot as plt

from pathlib import Path
from tqdm.notebook import tqdm

# Data

In [None]:
# Root of the competition dataset; the audio folders are derived from it so the
# location only has to be changed in one place.
DATA_ROOT = Path("../input/rfcx-species-audio-detection")
TRAIN_AUDIO_ROOT = DATA_ROOT / "train"
TEST_AUDIO_ROOT = DATA_ROOT / "test"

In [None]:
# Recording ids are the audio file names without their ".flac" extension.
# glob() yields files in an arbitrary, filesystem-dependent order, so the ids
# are sorted: this keeps "the first recording" and the chunk assignment used
# further down identical from one run to the next.
df_train = pd.DataFrame({
    "recording_id": sorted(path.stem for path in TRAIN_AUDIO_ROOT.glob("*.flac")),
})

df_test = pd.DataFrame({
    "recording_id": sorted(path.stem for path in TEST_AUDIO_ROOT.glob("*.flac")),
})

# Tools

In [None]:
class params:
    """
    Audio settings shared by the loading and mel-spectrogram steps.
    Used as a plain namespace: attributes are read directly on the class.
    """
    # Sampling rate (Hz) the recordings are loaded at
    sr = 32_000

    # Mel-spectrogram settings
    n_mels = 128    # number of mel bands
    fmin = 20       # lowest frequency (Hz) of the mel filter bank
    fmax = sr // 2  # highest frequency (Hz): half the sampling rate (Nyquist limit)

In [None]:
def load_audio(record, sr=16000, root=""):
    """
    Loads one recording from disk with librosa.
    Arguments:
        record {str} -- Recording id, i.e. the file name; its suffix (if any) is replaced by ".flac"
        sr {int} -- Sampling rate passed to librosa.load (default: {16000})
        root {str or Path} -- Folder containing the .flac files; "" means the current directory (default: {""})
    Returns:
        np array -- Audio signal returned by librosa.load
    """
    # Path(root) accepts both str and Path. Calling .joinpath on the raw argument
    # crashed with AttributeError for the default root="" (a plain string).
    audio_path = Path(root).joinpath(record).with_suffix(".flac")

    # librosa.load also returns the sampling rate, which equals `sr` here and is dropped.
    y, _ = lb.load(audio_path.as_posix(), sr=sr)
    return y

In [None]:
def compute_melspec(y, params):
    """
    Computes a mel-spectrogram and puts it at decibel scale
    Arguments:
        y {np array} -- signal
        params {AudioParams} -- Parameters to use for the spectrogram. Expected to have the attributes sr, n_mels, fmin, fmax
    Returns:
        np array -- Mel-spectrogram in dB, as float32
    """
    # The signal is passed by keyword: recent librosa releases (0.10+) reject it as a
    # positional argument, while y=... works on both old and new versions.
    melspec = lb.feature.melspectrogram(
        y=y, sr=params.sr, n_mels=params.n_mels, fmin=params.fmin, fmax=params.fmax,
    )

    # Power -> decibels, then float32 to halve the size of the saved arrays.
    melspec = lb.power_to_db(melspec).astype(np.float32)
    return melspec

# Example

In [None]:
# Decode the first training recording at the configured sampling rate.
first_record = df_train["recording_id"][0]
y = load_audio(first_record, sr=params.sr, root=TRAIN_AUDIO_ROOT)

In [None]:
# Mel-spectrogram (in dB) of the example recording loaded above.
melspec = compute_melspec(y=y, params=params)

In [None]:
# Show the first 512 frames of the example spectrogram
# (pass `melspec` instead of the slice to display the whole recording).
fig, ax = plt.subplots(figsize=(15, 5))
img = librosa.display.specshow(
    melspec[:, :512],
    sr=params.sr,
    x_axis='time',
    # The rows are mel bands, not linearly spaced FFT bins: a 'linear' axis would
    # label the frequencies incorrectly. fmin/fmax must match compute_melspec.
    y_axis='mel',
    fmin=params.fmin,
    fmax=params.fmax,
    ax=ax)
ax.set_title("Mel-spectrogram (first 512 frames)")
fig.colorbar(img, ax=ax, format="%+2.f dB")
plt.show()

In [None]:
# Cache the example spectrogram on disk; the timing cells below read it back.
np.save(file="melspec.npy", arr=melspec)

# Time comparison

In [None]:
%%timeit 

# Timed: reading the cached spectrogram back from disk.
cached_melspec = np.load(file="melspec.npy")

In [None]:
%%timeit 

# Timed: decoding the audio and recomputing the mel-spectrogram from scratch.
first_record = df_train["recording_id"][0]
melspec = compute_melspec(
    load_audio(first_record, params.sr, TRAIN_AUDIO_ROOT),
    params,
)

Loading the saved spectrogram is about 3000× faster than recomputing it from the audio file!

# Main

## Train

In [None]:
def load_and_save_train(location, record):
    """
    Converts one training recording to a mel-spectrogram and saves it as .npy
    Arguments:
        location {str} -- Output prefix, concatenated as-is with the file name (e.g. 'train1/')
        record {str} -- Recording id; the file is written to location + record + ".npy"
    """
    waveform = load_audio(record, sr=params.sr, root=TRAIN_AUDIO_ROOT)
    spectrogram = compute_melspec(waveform, params)

    # Plain concatenation: `location` must already end with a path separator.
    np.save("".join((location, record, ".npy")), spectrogram)

In [None]:
# Output folders for the four training chunks. The trailing slash is required
# because load_and_save_train builds file paths by plain string concatenation.
OUT_TRAIN_1 = 'train1/'
OUT_TRAIN_2 = 'train2/'
OUT_TRAIN_3 = 'train3/'
OUT_TRAIN_4 = 'train4/'

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever a folder was left over from a previous (interrupted) run.
for out_dir in (OUT_TRAIN_1, OUT_TRAIN_2, OUT_TRAIN_3, OUT_TRAIN_4):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
print(df_train.shape)

# Split the training recordings into 4 consecutive blocks, each converted, zipped
# and removed in turn, so only one block of .npy files is on disk at a time.
# The "+ 1" makes 4 * chunk exceed the number of rows, so the last block absorbs
# the remainder (slicing past the end is harmless).
chunk = (df_train.shape[0] // 4) + 1
for block in range(4):
    start = block * chunk
    stop = start + chunk
    location = f'train{block+1}/'

    print(location,start,stop)

    # The folder is deleted at the end of each pass; recreate it if needed so this
    # cell can be re-run on its own.
    os.makedirs(location, exist_ok=True)

    # Explicit positional slice of the recording ids belonging to this block.
    records = df_train['recording_id'].iloc[start:stop].values
    _ = joblib.Parallel(n_jobs=8)(
        joblib.delayed(load_and_save_train)(location, record) for record in tqdm(records)
    )

    # The archive base name must not end with a slash: make_archive appends ".zip"
    # to it, and on recent Python versions 'train1/' becomes 'train1/.zip', i.e. a
    # file inside the folder that rmtree deletes right after.
    shutil.make_archive(location.rstrip('/'), 'zip', location)
    shutil.rmtree(location)

In [None]:
# df_train['recording_id'][start:stop].values

In [None]:
# _ = joblib.Parallel(n_jobs=8)(
#     joblib.delayed(load_and_save_train)(OUT_TRAIN_1,record) for record in tqdm(df_train['recording_id'].values)
# )

# _ = joblib.Parallel(n_jobs=8)(
#     joblib.delayed(load_and_save_train)(OUT_TRAIN_2,record) for record in tqdm(df_train['recording_id'].values)
# )

# _ = joblib.Parallel(n_jobs=8)(
#     joblib.delayed(load_and_save_train)(OUT_TRAIN_3,record) for record in tqdm(df_train['recording_id'].values)
# )

# _ = joblib.Parallel(n_jobs=8)(
#     joblib.delayed(load_and_save_train)(OUT_TRAIN_4,record) for record in tqdm(df_train['recording_id'].values)
# )

In [None]:
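# Superseded by the per-block archiving done in the chunked loop above
# (note: OUT_TRAIN is not defined in the visible cells of this notebook).
# shutil.make_archive(OUT_TRAIN, 'zip', OUT_TRAIN)
# shutil.rmtree(OUT_TRAIN)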

## Test

In [None]:
def load_and_save_test(record):
    """
    Converts one test recording to a mel-spectrogram and saves it as .npy
    Arguments:
        record {str} -- Recording id; the file is written to OUT_TEST + record + ".npy"
    """
    waveform = load_audio(record, sr=params.sr, root=TEST_AUDIO_ROOT)
    spectrogram = compute_melspec(waveform, params)

    # OUT_TEST is a module-level prefix looked up at call time; it is concatenated
    # as-is, so it must already end with a path separator.
    np.save("".join((OUT_TEST, record, ".npy")), spectrogram)

In [None]:
# Output folder for the test spectrograms. The trailing slash is required because
# load_and_save_test builds file paths by plain string concatenation.
OUT_TEST = 'test/'
# exist_ok=True keeps this cell re-runnable (os.mkdir raised FileExistsError when
# the folder was left over from a previous run).
os.makedirs(OUT_TEST, exist_ok=True)

In [None]:
# Convert every test recording to a saved mel-spectrogram, using 8 parallel jobs.
test_records = df_test['recording_id'].values
save_test_job = joblib.delayed(load_and_save_test)
_ = joblib.Parallel(n_jobs=8)(save_test_job(record) for record in tqdm(test_records))

In [None]:
# Zip the test spectrograms, then delete the folder of loose .npy files.
# The archive base name must not end with a slash: make_archive appends ".zip" to
# it, and on recent Python versions 'test/' becomes 'test/.zip', i.e. a file inside
# the folder that rmtree deletes right after.
shutil.make_archive(OUT_TEST.rstrip('/'), 'zip', OUT_TEST)
shutil.rmtree(OUT_TEST)