# librosaの勉強

+ https://librosa.org/doc/latest/tutorial.html

Library 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, IFrame, display # jupyterで再生につかう

In [None]:
# Print the installed librosa version.
print(librosa.__version__)

In [None]:
# 1. Set the path of the audio file.
#filename  = librosa.example('nutcracker')
filename ='../input/birdsong-resampled-train-audio-04/wooscj2/XC67042.wav'

# 2. `y`: waveform
#    `sr`: sampling rate
# With the defaults the audio is loaded as mono at sr=22050.
y, sr = librosa.load(filename )
# type(y) -> numpy.ndarray
# y.shape -> (144577,)
# sr      -> 22050

# 2-1. The sampling rate can be specified explicitly.
#      (This overwrites the `y`, `sr` loaded above.)
y, sr = librosa.load(filename, sr=32000)
# y.shape -> (209816,)
# sr      -> 32000

# 3. Run the default beat tracker (tempo / BPM estimation).
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
# type(tempo) -> numpy.float64
# tempo       -> 113.63636363636364
# beat_frames -> array([  3,  38,  71, 101, 132, 165, 196])

# 4. Convert the frame indices of the beat events into timestamps.
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
# beat_times -> array([0.048, 0.608, 1.136, 1.616, 2.112, 2.64 , 3.136])

In [None]:
# Plot the waveform and embed an audio player.
# `librosa.display.waveplot` does not exist in newer librosa releases, where
# `waveshow` replaces it; prefer `waveshow` and fall back for old installs.
_plot_wave = getattr(librosa.display, "waveshow", None)
if _plot_wave is None:
    _plot_wave = librosa.display.waveplot
_plot_wave(y, sr=sr)
Audio(y, rate=sr)

### 調波打楽器音分離 (HPSS: Harmonic/Percussive Source Separation)

打楽器の音と非打楽器（調波楽器）の音を分離する  
https://www.wizard-notes.com/entry/music-analysis/hpss#f-4f2dfd09

In [None]:
# Separate harmonics and percussives into two waveforms
y_harmonic, y_percussive = librosa.effects.hpss(y)

# Plot the original, harmonic and percussive waveforms as three stacked panels.
plt.subplot(3,1,1)
plt.plot(y)
plt.title("Original signal")

plt.subplot(3,1,2)
plt.plot(y_harmonic)
plt.title("Harmonic signal")

plt.subplot(3,1,3)
plt.plot(y_percussive)
plt.title("Percussive signal")

plt.tight_layout()
plt.show()

# Audio players for each separated component (same sampling rate as `y`).
print('y_harmonic')
display(Audio(y_harmonic,rate=sr))
print('y_percussive')
display(Audio(y_percussive,rate=sr))

### メル周波数ケプストラム係数(MFCC:Mel-Frequency Cepstrum Coefficients)

+ https://www.wizard-notes.com/entry/music-analysis/insts-timbre-with-mfcc
  + n_mfcc:分析や機械学習で使う場合は12～24くらいの次元数が良く使われます。

```メル尺度（メルしゃくど、英語: mel scale）は、音高の知覚的尺度である。メル尺度の差が同じであれば、人間が感じる音高の差が同じになることを意図している。```  
人間の聴覚特性に合わせた変換

### log-melspectrogram

```MFCCの離散コサイン変換無いバージョンです。```
+ [機械学習のための音声の特徴量ざっくりメモ (Librosa ,numpy)](https://qiita.com/yutalfa/items/dbd172138db60d461a56)

In [None]:
# Compute 24 MFCC coefficients per frame and show them as an image.
mfcc = librosa.feature.mfcc(y=y, sr=sr,n_mfcc=24)
#mfcc.shape -> (24, 410)
librosa.display.specshow(mfcc, sr=sr)
plt.colorbar()

In [None]:
# Mel spectrogram with 128 bands between 20 Hz and 16 kHz.
# `y` is passed by keyword: newer librosa releases no longer accept it positionally.
melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmin=20, fmax=16000)
# Convert power to decibels.
melspec = librosa.power_to_db(melspec).astype(np.float32)
# melspec.shape -> (128, 410)
librosa.display.specshow(melspec, sr=sr)
plt.colorbar()

In [None]:
# Compute the (smoothed) first-order difference between the columns of the input.
# The output has the same shape as `mfcc`.
mfcc_delta = librosa.feature.delta(mfcc)
#mfcc_delta.shape -> (24, 410)
librosa.display.specshow(mfcc_delta, sr=sr)
plt.colorbar()

その他処理は下記
https://librosa.org/doc/latest/tutorial.html

### sfとlibrosaの比較

In [None]:
import soundfile as sf
# Same file as used in the earlier cells.
filename ='../input/birdsong-resampled-train-audio-04/wooscj2/XC67042.wav'

# librosa is asked for sr=32000; soundfile is called with the path only.
y, sr = librosa.load(filename , sr=32000)
y_1, sr_1 = sf.read(filename)

# Compare shape, element type and object size reported for each loader's output.
print(f"librosa \n shape{y.shape},\n type{type(y[0])}\n size{y.__sizeof__()}")
print("-"*50)
print(f"soundfile \n shape{y_1.shape},\n type{type(y_1[0])}\n size{y_1.__sizeof__()}")

# ノイズ消す実験

In [None]:
# trainは下記コード(pip)でもいいがtestでは使えないので下記の長文のコードをそのままはるのがいいか？
#!pip install noisereduce

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import noisereduce as nr

import soundfile as sf
import librosa
import librosa.display
from IPython.display import Audio, IFrame, display # jupyterで再生につかう

from scipy.ndimage import maximum_filter1d

In [None]:
# https://github.com/timsainb/noisereduce
import scipy.signal
import numpy as np
import librosa
from tqdm.autonotebook import tqdm
import warnings

def _stft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Compute the STFT of `y` with either the librosa or the tensorflow backend.

    Arguments:
        y -- signal to transform
        n_fft, hop_length, win_length -- STFT parameters forwarded to the backend
        use_tensorflow {bool} -- dispatch to `_stft_tensorflow` instead of
            `librosa.stft` (default: {False})
    """
    if not use_tensorflow:
        return librosa.stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            center=True,
        )
    return _stft_tensorflow(y, n_fft, hop_length, win_length)


def _istft(y, n_fft, hop_length, win_length, use_tensorflow=False):
    """Invert an STFT matrix with either the librosa or the tensorflow backend.

    Arguments:
        y -- STFT matrix to invert
        n_fft -- FFT size (only used by the tensorflow backend)
        hop_length, win_length -- STFT parameters forwarded to the backend
        use_tensorflow {bool} -- dispatch to `_istft_tensorflow`, which receives
            the transposed matrix, instead of `librosa.istft` (default: {False})
    """
    if use_tensorflow:
        return _istft_tensorflow(y.T, n_fft, hop_length, win_length)
    # Keyword arguments: newer librosa releases reject these positionally,
    # while older releases accept the same keyword names.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)


def _stft_librosa(y, n_fft, hop_length, win_length):
    """Compute a centered STFT of `y` with librosa."""
    stft_options = {
        "n_fft": n_fft,
        "hop_length": hop_length,
        "win_length": win_length,
        "center": True,
    }
    return librosa.stft(y=y, **stft_options)


def _istft_librosa(y, hop_length, win_length):
    """Invert the STFT matrix `y` with librosa.

    The hop and window lengths are passed by keyword because newer librosa
    releases no longer accept them positionally.
    """
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)


def _stft_tensorflow(y, n_fft, hop_length, win_length):
    """Compute the STFT of `y` with `tf.signal.stft` and return it transposed.

    Relies on the module-global `tf` that `load_tensorflow` installs.
    """
    spectrogram = tf.signal.stft(
        y,
        win_length,
        hop_length,
        n_fft,
        pad_end=True,
        window_fn=tf.signal.hann_window,
    )
    # Convert to numpy and transpose the result before returning it.
    as_numpy = spectrogram.numpy()
    return as_numpy.T


def _istft_tensorflow(y, n_fft, hop_length, win_length):
    """Invert the STFT matrix `y` with `tf.signal.inverse_stft`.

    The input is cast to complex64 first; the result is returned as a numpy
    array. Relies on the module-global `tf` that `load_tensorflow` installs.
    """
    complex_stft = y.astype(np.complex64)
    signal = tf.signal.inverse_stft(complex_stft, win_length, hop_length, n_fft)
    return signal.numpy()


def _amp_to_db(x):
    """Convert amplitudes to decibels (ref=1.0, amin=1e-20, top_db=80.0)."""
    db_options = {"ref": 1.0, "amin": 1e-20, "top_db": 80.0}
    return librosa.core.amplitude_to_db(x, **db_options)


def _db_to_amp(x):
    """Convert decibel values back to amplitudes relative to ref=1.0."""
    amplitude = librosa.core.db_to_amplitude(x, ref=1.0)
    return amplitude


def update_pbar(pbar, message):
    """Set `message` as the bar's description and advance it by one step.

    Does nothing when `pbar` is None (progress reporting disabled).
    """
    if pbar is None:
        return
    pbar.set_description(message)
    pbar.update(1)


def _smoothing_filter(n_grad_freq, n_grad_time):
    """Build the normalized 2-D kernel used to smooth the spectrogram mask.

    The kernel is the outer product of two triangular ramps (one along
    frequency, one along time), scaled so that its entries sum to 1.

    Arguments:
        n_grad_freq {int} -- how many frequency channels to smooth over with the mask.
        n_grad_time {int} -- how many time channels to smooth over with the mask.

    Returns:
        2-D array of shape (2 * n_grad_freq + 1, 2 * n_grad_time + 1)
    """

    def _triangle(half_width):
        # Rising ramp followed by a falling ramp; the zero-valued end points
        # are dropped, leaving 2 * half_width + 1 samples peaking at 1.
        rising = np.linspace(0, 1, half_width + 1, endpoint=False)
        falling = np.linspace(1, 0, half_width + 2)
        return np.concatenate([rising, falling])[1:-1]

    kernel = np.outer(_triangle(n_grad_freq), _triangle(n_grad_time))
    return kernel / np.sum(kernel)


def mask_signal(sig_stft, sig_mask):
    """Attenuate time/frequency regions of a spectrogram according to a mask.

    Arguments:
        sig_stft -- spectrogram of signal
        sig_mask -- mask to apply; 1 removes a bin completely, 0 leaves it as is

    Returns:
        the masked spectrogram, `sig_stft * (1 - sig_mask)`
    """
    keep_fraction = 1 - sig_mask
    return sig_stft * keep_fraction


def convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow=False):
    """Convolve a mask (or any image) with a smoothing filter.

    Arguments:
        sig_mask -- the signal mask
        smoothing_filter -- the filter to convolve with

    Keyword Arguments:
        use_tensorflow {bool} -- use tf.nn.conv2d instead of
            scipy.signal.fftconvolve (default: {False})

    Returns:
        the smoothed mask, computed with "same" padding
    """
    if not use_tensorflow:
        return scipy.signal.fftconvolve(sig_mask, smoothing_filter, mode="same")

    # Tensorflow path: rescale the filter, then add two trailing unit axes to
    # both operands and cast to float32 before calling conv2d.
    gain = (np.shape(smoothing_filter)[1] - 1) / 2 + 1
    kernel = (smoothing_filter * gain)[:, :, tf.newaxis, tf.newaxis].astype("float32")
    img = sig_mask[:, :, tf.newaxis, tf.newaxis].astype("float32")
    smoothed = tf.nn.conv2d(img, kernel, strides=[1, 1, 1, 1], padding="SAME")
    return smoothed.numpy().squeeze()


def load_tensorflow(verbose=False):
    """Load tensorflow if it is available.

    Used as a backend for fft and convolution. On success the module is bound
    to the module-global name `tf` so the `_*_tensorflow` helpers can use it.

    Arguments:
        verbose {bool} -- also print the GPUs tensorflow can see (default: {False})

    Returns:
        bool -- whether to use tensorflow (False when it cannot be imported,
        fails while being probed, or is older than 2.0)
    """
    try:
        # Equivalent of `import tensorflow as tf` at module scope.
        globals()["tf"] = __import__("tensorflow")

        if verbose:
            available_gpus = tf.config.experimental.list_physical_devices("GPU")
            print("GPUs available: {}".format(available_gpus))
        # Compare the whole major component, not just the first character.
        if int(tf.__version__.split(".")[0]) < 2:
            warnings.warn(
                "Tensorflow version is below 2.0, reverting to non-tensorflow backend"
            )
            return False
    except ImportError:
        warnings.warn(
            "Tensorflow is not installed, reverting to non-tensorflow backend"
        )
        return False
    except Exception as err:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
        # and no longer mislabels other failures as "not installed".
        warnings.warn(
            "Tensorflow backend is unavailable ({!r}), "
            "reverting to non-tensorflow backend".format(err)
        )
        return False
    return True


def reduce_noise(
    audio_clip,
    noise_clip,
    n_grad_freq=2,
    n_grad_time=4,
    n_fft=2048,
    win_length=2048,
    hop_length=512,
    n_std_thresh=1.5,
    prop_decrease=1.0,
    pad_clipping=True,
    use_tensorflow=False,
    verbose=False,
):
    """Remove noise from audio based upon a clip containing only noise.

    A per-frequency dB threshold is derived from the noise clip's STFT
    (mean + n_std_thresh * std). Signal STFT bins below that threshold are
    marked, the mark is smoothed in time and frequency, and the signal STFT is
    attenuated by the smoothed mask before being inverted.

    Args:
        audio_clip (array): The signal to denoise.
        noise_clip (array): A clip containing only noise, used for the statistics.
        n_grad_freq (int): how many frequency channels to smooth over with the mask.
        n_grad_time (int): how many time channels to smooth over with the mask.
        n_fft (int): FFT size passed to the STFT.
        win_length (int): Window length passed to the STFT.
        hop_length (int): Hop length passed to the STFT; also the amount of
            zero padding appended when `pad_clipping` is set.
        n_std_thresh (float): how many standard deviations louder than the mean
            dB of the noise (at each frequency level) to be considered signal.
        prop_decrease (float): To what extent should you decrease noise (1 = all, 0 = none).
        pad_clipping (bool): Pad the signal with zeros and trim the result so the
            reconstructed data has the same length as the input.
        use_tensorflow (bool): Use tensorflow as a backend for convolution and fft.
        verbose (bool): Show a tqdm progress bar (and, with tensorflow, print
            the available GPUs).
    Returns:
        array: The recovered signal with noise subtracted
    """
    # load tensorflow if you are using it as a backend
    if use_tensorflow:
        use_tensorflow = load_tensorflow(verbose)

    pbar = tqdm(total=7) if verbose else None

    update_pbar(pbar, "STFT on noise")
    # STFT over noise
    noise_stft = _stft(
        noise_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    noise_stft_db = _amp_to_db(np.abs(noise_stft))  # convert to dB

    # Calculate statistics over noise (one value per frequency row)
    update_pbar(pbar, "STFT on signal")
    mean_freq_noise = np.mean(noise_stft_db, axis=1)
    std_freq_noise = np.std(noise_stft_db, axis=1)
    noise_thresh = mean_freq_noise + std_freq_noise * n_std_thresh

    # STFT over signal
    update_pbar(pbar, "STFT on signal")
    # pad signal with zeros to avoid extra frames being clipped if desired
    if pad_clipping:
        nsamp = len(audio_clip)
        audio_clip = np.pad(audio_clip, [0, hop_length], mode="constant")

    sig_stft = _stft(
        audio_clip, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    # spectrogram of signal in dB
    sig_stft_db = _amp_to_db(np.abs(sig_stft))

    update_pbar(pbar, "Generate mask")
    # Mark bins that fall BELOW the per-frequency noise threshold (treated as
    # noise). Broadcasting the threshold column over time frames yields the
    # same boolean matrix as materializing a repeated threshold array.
    sig_mask = sig_stft_db < noise_thresh[:, np.newaxis]

    update_pbar(pbar, "Smooth mask")
    # Create a smoothing filter for the mask in time and frequency
    smoothing_filter = _smoothing_filter(n_grad_freq, n_grad_time)
    # convolve the mask with a smoothing filter
    sig_mask = convolve_gaussian(sig_mask, smoothing_filter, use_tensorflow)
    sig_mask = sig_mask * prop_decrease

    update_pbar(pbar, "Apply mask")
    # mask the signal
    sig_stft_amp = mask_signal(sig_stft, sig_mask)

    update_pbar(pbar, "Recover signal")
    # recover the signal
    recovered_signal = _istft(
        sig_stft_amp, n_fft, hop_length, win_length, use_tensorflow=use_tensorflow
    )
    # fix the recovered signal length if padding signal
    # (`size` by keyword: newer librosa releases reject it positionally)
    if pad_clipping:
        recovered_signal = librosa.util.fix_length(recovered_signal, size=nsamp)

    # Release the progress bar so it does not stay open in the notebook.
    if pbar is not None:
        pbar.close()

    return recovered_signal

In [None]:
def envelope(y, rate, threshold):
    """Split a 1-D signal into loud and quiet samples using its amplitude envelope.

    The envelope is the running maximum of ``abs(y)`` over a window of
    ``rate // 20`` samples (at least 1), with zeros assumed beyond the edges.

    Args:
        y: 1-D waveform.
        rate: sampling rate of `y`; only used to size the window.
        threshold: envelope value above which a sample is flagged.

    Returns:
        (mask, y_mean): `mask` is a list of bools, True where the envelope is
        above `threshold`; `y_mean` is the envelope array itself.
    """
    # rate < 20 would give a window of 0 samples, which the filter rejects.
    window = max(1, rate // 20)
    y_mean = maximum_filter1d(np.abs(y), mode="constant", size=window)
    # Vectorized comparison instead of a per-sample Python loop; converted
    # back to a list so the return type stays the same.
    mask = (y_mean > threshold).tolist()
    return mask, y_mean

In [None]:
# Input for the noise-removal experiment.
# NOTE: the first assignment is immediately overwritten by the mp3 path below.
filename ='../input/birdsong-resampled-train-audio-04/sagthr/XC112650.wav' # long clip
#filename ='../input/birdsong-resampled-train-audio-04/wooscj2/XC67042.wav' # short clip
filename = '../input/birdsong-recognition/train_audio/bkcchi/XC114073.mp3'
#y, sr = sf.read(filename)
# Load as mono at 32 kHz using the "kaiser_fast" resampler.
y, sr = librosa.load(filename,sr=32000,mono=True,res_type="kaiser_fast")
print("sampling rate:", sr)
plt.plot(y)
plt.show()

Audio(data=y, rate=sr)

In [None]:
# Envelope threshold: samples whose envelope exceeds it are labelled "birdcall".
thr = 0.25
mask, env = envelope(y, sr, thr)

# Plot the flagged samples and the remaining samples (each concatenated).
plt.plot(y[mask], label="birdcall")
plt.plot(y[np.logical_not(mask)], label="noise")
plt.legend(bbox_to_anchor=(1, 1), loc='upper right')

In [None]:
# Denoise `y`, using the samples below the envelope threshold as the noise profile.
y_denoise = reduce_noise(audio_clip=y, noise_clip=y[np.logical_not(mask)], verbose=True)


In [None]:
# Print both shapes, then embed players for the original and denoised audio.
print(f'{y.shape}{y_denoise.shape}')
print('y')
display(Audio(y,rate=sr))
print('y_denoise')
display(Audio(y_denoise,rate=sr))

In [None]:
# Log-mel spectrogram of the original (noisy) clip.
# `y` is passed by keyword: newer librosa releases no longer accept it positionally.
melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmin=20, fmax=16000)
# Convert power to decibels.
melspec = librosa.power_to_db(melspec).astype(np.float32)
librosa.display.specshow(melspec, sr=sr)
plt.colorbar()

In [None]:
# Log-mel spectrogram of the denoised clip, for comparison with the original.
# `y` is passed by keyword: newer librosa releases no longer accept it positionally.
melspec_denoise = librosa.feature.melspectrogram(y=y_denoise, sr=sr, n_mels=128, fmin=20, fmax=16000)
# Convert power to decibels.
melspec_denoise = librosa.power_to_db(melspec_denoise).astype(np.float32)
librosa.display.specshow(melspec_denoise, sr=sr)
plt.colorbar()

## SpecAugment試験

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, IFrame, display # jupyterで再生につかう
import soundfile as sf
import random

In [None]:
# Audio file used for the SpecAugment experiment.
filename ='../input/birdsong-resampled-train-audio-04/wooscj2/XC67042.wav'

In [None]:
class SpecAugmentTransform(object):
    """Callable that applies SpecAugment-style masking to a 2-D spectrogram.

    The spectrogram is indexed as (frequency, time). Frequency masking
    overwrites a random band of rows and time masking a random band of columns
    with the spectrogram's current mean. The input is never modified in place
    by the masking steps (they work on a copy).

    Args:
        do_time_warp: enable time warping. The implementation is unfinished,
            so this should stay False.
        do_freq_mask: enable frequency masking.
        do_time_mask: enable time masking.
        num_freq_mask: number of frequency masks to draw.
        F: exclusive upper bound on the width drawn for a frequency mask.
        num_time_mask: number of time masks to draw.
        time: exclusive upper bound on the width drawn for a time mask.
    """

    def __init__(self,
                 do_time_warp=False,  # unfinished, keep False
                 do_freq_mask=True,
                 do_time_mask=True,
                 num_freq_mask=1,
                 F=27,
                 num_time_mask=1,
                 time=100):
        self.do_time_warp = do_time_warp
        self.do_freq_mask = do_freq_mask
        self.do_time_mask = do_time_mask
        self.num_freq_mask = num_freq_mask
        self.F = F
        self.num_time_mask = num_time_mask
        self.time = time

    def __call__(self, spec):
        """Apply the configured augmentations to `spec`."""
        return self.transform(spec)

    def transform(self, spec):
        """Run the enabled steps in order: time warp, frequency mask, time mask."""
        if self.do_time_warp:
            spec = self.time_warp(spec=spec, W=5)
        if self.do_freq_mask:
            spec = self.freq_mask(spec=spec, F=self.F, num_masks=self.num_freq_mask)
        if self.do_time_mask:
            spec = self.time_mask(spec=spec, time=self.time, num_masks=self.num_time_mask)

        return spec

    # Unfinished: written against a torch tensor and uses `torch` and
    # `sparse_image_warp`, neither of which is imported in the visible cells.
    def time_warp(self, spec, W=5):
        num_rows = spec.shape[1]
        spec_len = spec.shape[2]
        device = spec.device

        y = num_rows//2
        horizontal_line_at_ctr = spec[0][y]
        assert len(horizontal_line_at_ctr) == spec_len

        point_to_warp = horizontal_line_at_ctr[random.randrange(W, spec_len - W)]
        assert isinstance(point_to_warp, torch.Tensor)

        # Uniform distribution from (0,W) with chance to be up to W negative
        dist_to_warp = random.randrange(-W, W)
        src_pts, dest_pts = (torch.tensor([[[y, point_to_warp]]], device=device),
                             torch.tensor([[[y, point_to_warp + dist_to_warp]]], device=device))
        warped_spectro, dense_flows = sparse_image_warp(spec, src_pts, dest_pts)
        return warped_spectro.squeeze(3)

    def freq_mask(self, spec, F, num_masks):
        """Mask `num_masks` random frequency bands of random size.

        Returns a masked copy of `spec`. The drawn width is limited to the
        number of rows so small spectrograms (or F <= 0) do not make
        `random.randrange` raise; for F <= number of rows the draws are
        unchanged.
        """
        masked = spec.copy()
        num_mel_channels = masked.shape[0]
        max_width = min(F, num_mel_channels)
        if max_width <= 0:
            return masked
        for _ in range(num_masks):
            freq = random.randrange(0, max_width)
            zero = random.randrange(0, num_mel_channels - freq)
            # avoids randrange error if values are equal and range is empty
            if freq == 0:
                continue
            mask_end = random.randrange(zero, zero + freq)
            masked[zero:mask_end] = masked.mean()
        return masked

    def time_mask(self, spec, time, num_masks):
        """Mask `num_masks` random time spans of random size.

        Returns a masked copy of `spec`. The drawn width is limited to the
        number of columns so short spectrograms (or time <= 0) do not make
        `random.randrange` raise; for time <= number of columns the draws are
        unchanged.
        """
        masked = spec.copy()
        length = masked.shape[1]
        max_width = min(time, length)
        if max_width <= 0:
            return masked
        for _ in range(num_masks):
            t = random.randrange(0, max_width)
            zero = random.randrange(0, length - t)
            # avoids randrange error if values are equal and range is empty
            if t == 0:
                continue
            mask_end = random.randrange(zero, zero + t)
            masked[:, zero:mask_end] = masked.mean()
        return masked

### SpecAugment用のtransformerを定義

In [None]:
# SpecAugment with 3 frequency masks and 5 time masks; time warping stays off.
transform = SpecAugmentTransform(do_time_warp=False, 
                                 do_freq_mask=True, 
                                 do_time_mask=True,
                                 num_freq_mask=3,
                                 num_time_mask=5)

In [None]:
# Length, in seconds, of the clip that is turned into a spectrogram.
PERIOD = 5


def test_spectrogram(file_path, code, spectrogram_transforms=None):
    """Load a clip, crop/pad it to PERIOD seconds and return its log-mel spectrogram.

    Args:
        file_path: path of the audio file, read with soundfile.
        code: label of the clip; not used here, kept for the call signature.
        spectrogram_transforms: optional callable applied to the spectrogram.

    Returns:
        (melspec, sr): float32 dB-scaled mel spectrogram and the file's sampling rate.
    """
    y, sr = sf.read(file_path)

    len_y = len(y)
    effective_length = sr * PERIOD
    if len_y < effective_length:
        # Too short: place the clip at a random offset inside a zero buffer.
        new_y = np.zeros(effective_length, dtype=y.dtype)
        start = np.random.randint(effective_length - len_y)
        new_y[start:start + len_y] = y
        y = new_y
    elif len_y > effective_length:
        # Too long: take a random window of the target length.
        start = np.random.randint(len_y - effective_length)
        y = y[start:start + effective_length]
    y = y.astype(np.float32)

    # `y` is passed by keyword: newer librosa releases no longer accept it positionally.
    melspec = librosa.feature.melspectrogram(y=y, sr=sr)
    melspec = librosa.power_to_db(melspec).astype(np.float32)

    if spectrogram_transforms:
        melspec = spectrogram_transforms(melspec)

    return melspec, sr

### SpecAugmentなし

In [None]:
# Baseline: spectrogram without any augmentation.
melspec, sr = test_spectrogram(file_path=filename, code="aaa", spectrogram_transforms=None)
librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()

## SpecAugmentあり

In [None]:
# Same file with the SpecAugment transform applied to the spectrogram.
auged_melspec, sr = test_spectrogram(file_path=filename, code="aaa", spectrogram_transforms=transform)
librosa.display.specshow(auged_melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()

## MixUp

## 手法の超ざっくり説明
 * ランダムに抽出した２つのデータをミックスして１つにする
 * ラベルデータもミックスして１つにする


### ハイパーパラメータαについて
 * 0に近いほど、λは0か1に近い数値となる（Beta分布が鍋型）。
 * 1に近いほど、λは0～1の間のランダムな数値となる（Beta分布が一様分布）。
 * 1より大きいほど、λは0.5に近い数値となる（Beta分布が釣り鐘型）。

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, IFrame, display # jupyterで再生につかう
import soundfile as sf
import random

In [None]:
# Clip length in seconds; multiplied by the sampling rate to get the target sample count.
PERIOD = 5

In [None]:
BIRD_CODE = {
    'aldfly': 0, 'ameavo': 1, 'amebit': 2, 'amecro': 3, 'amegfi': 4,
    'amekes': 5, 'amepip': 6, 'amered': 7, 'amerob': 8, 'amewig': 9,
    'amewoo': 10, 'amtspa': 11, 'annhum': 12, 'astfly': 13, 'baisan': 14,
    'baleag': 15, 'balori': 16, 'banswa': 17, 'barswa': 18, 'bawwar': 19,
    'belkin1': 20, 'belspa2': 21, 'bewwre': 22, 'bkbcuc': 23, 'bkbmag1': 24,
    'bkbwar': 25, 'bkcchi': 26, 'bkchum': 27, 'bkhgro': 28, 'bkpwar': 29,
    'bktspa': 30, 'blkpho': 31, 'blugrb1': 32, 'blujay': 33, 'bnhcow': 34,
    'boboli': 35, 'bongul': 36, 'brdowl': 37, 'brebla': 38, 'brespa': 39,
    'brncre': 40, 'brnthr': 41, 'brthum': 42, 'brwhaw': 43, 'btbwar': 44,
    'btnwar': 45, 'btywar': 46, 'buffle': 47, 'buggna': 48, 'buhvir': 49,
    'bulori': 50, 'bushti': 51, 'buwtea': 52, 'buwwar': 53, 'cacwre': 54,
    'calgul': 55, 'calqua': 56, 'camwar': 57, 'cangoo': 58, 'canwar': 59,
    'canwre': 60, 'carwre': 61, 'casfin': 62, 'caster1': 63, 'casvir': 64,
    'cedwax': 65, 'chispa': 66, 'chiswi': 67, 'chswar': 68, 'chukar': 69,
    'clanut': 70, 'cliswa': 71, 'comgol': 72, 'comgra': 73, 'comloo': 74,
    'commer': 75, 'comnig': 76, 'comrav': 77, 'comred': 78, 'comter': 79,
    'comyel': 80, 'coohaw': 81, 'coshum': 82, 'cowscj1': 83, 'daejun': 84,
    'doccor': 85, 'dowwoo': 86, 'dusfly': 87, 'eargre': 88, 'easblu': 89,
    'easkin': 90, 'easmea': 91, 'easpho': 92, 'eastow': 93, 'eawpew': 94,
    'eucdov': 95, 'eursta': 96, 'evegro': 97, 'fiespa': 98, 'fiscro': 99,
    'foxspa': 100, 'gadwal': 101, 'gcrfin': 102, 'gnttow': 103, 'gnwtea': 104,
    'gockin': 105, 'gocspa': 106, 'goleag': 107, 'grbher3': 108, 'grcfly': 109,
    'greegr': 110, 'greroa': 111, 'greyel': 112, 'grhowl': 113, 'grnher': 114,
    'grtgra': 115, 'grycat': 116, 'gryfly': 117, 'haiwoo': 118, 'hamfly': 119,
    'hergul': 120, 'herthr': 121, 'hoomer': 122, 'hoowar': 123, 'horgre': 124,
    'horlar': 125, 'houfin': 126, 'houspa': 127, 'houwre': 128, 'indbun': 129,
    'juntit1': 130, 'killde': 131, 'labwoo': 132, 'larspa': 133, 'lazbun': 134,
    'leabit': 135, 'leafly': 136, 'leasan': 137, 'lecthr': 138, 'lesgol': 139,
    'lesnig': 140, 'lesyel': 141, 'lewwoo': 142, 'linspa': 143, 'lobcur': 144,
    'lobdow': 145, 'logshr': 146, 'lotduc': 147, 'louwat': 148, 'macwar': 149,
    'magwar': 150, 'mallar3': 151, 'marwre': 152, 'merlin': 153, 'moublu': 154,
    'mouchi': 155, 'moudov': 156, 'norcar': 157, 'norfli': 158, 'norhar2': 159,
    'normoc': 160, 'norpar': 161, 'norpin': 162, 'norsho': 163, 'norwat': 164,
    'nrwswa': 165, 'nutwoo': 166, 'olsfly': 167, 'orcwar': 168, 'osprey': 169,
    'ovenbi1': 170, 'palwar': 171, 'pasfly': 172, 'pecsan': 173, 'perfal': 174,
    'phaino': 175, 'pibgre': 176, 'pilwoo': 177, 'pingro': 178, 'pinjay': 179,
    'pinsis': 180, 'pinwar': 181, 'plsvir': 182, 'prawar': 183, 'purfin': 184,
    'pygnut': 185, 'rebmer': 186, 'rebnut': 187, 'rebsap': 188, 'rebwoo': 189,
    'redcro': 190, 'redhea': 191, 'reevir1': 192, 'renpha': 193, 'reshaw': 194,
    'rethaw': 195, 'rewbla': 196, 'ribgul': 197, 'rinduc': 198, 'robgro': 199,
    'rocpig': 200, 'rocwre': 201, 'rthhum': 202, 'ruckin': 203, 'rudduc': 204,
    'rufgro': 205, 'rufhum': 206, 'rusbla': 207, 'sagspa1': 208, 'sagthr': 209,
    'savspa': 210, 'saypho': 211, 'scatan': 212, 'scoori': 213, 'semplo': 214,
    'semsan': 215, 'sheowl': 216, 'shshaw': 217, 'snobun': 218, 'snogoo': 219,
    'solsan': 220, 'sonspa': 221, 'sora': 222, 'sposan': 223, 'spotow': 224,
    'stejay': 225, 'swahaw': 226, 'swaspa': 227, 'swathr': 228, 'treswa': 229,
    'truswa': 230, 'tuftit': 231, 'tunswa': 232, 'veery': 233, 'vesspa': 234,
    'vigswa': 235, 'warvir': 236, 'wesblu': 237, 'wesgre': 238, 'weskin': 239,
    'wesmea': 240, 'wessan': 241, 'westan': 242, 'wewpew': 243, 'whbnut': 244,
    'whcspa': 245, 'whfibi': 246, 'whtspa': 247, 'whtswi': 248, 'wilfly': 249,
    'wilsni1': 250, 'wiltur': 251, 'winwre3': 252, 'wlswar': 253, 'wooduc': 254,
    'wooscj2': 255, 'woothr': 256, 'y00475': 257, 'yebfly': 258, 'yebsap': 259,
    'yehbla': 260, 'yelwar': 261, 'yerwar': 262, 'yetvir': 263
}

# Reverse lookup: integer class index -> eBird code string.
INV_BIRD_CODE = {v: k for k, v in BIRD_CODE.items()}

In [None]:
# Two sample entries, each `[wav_path, ebird_code]`, used to demonstrate MixUp.
sample_file_list = [['/kaggle/input/birdsong-resampled-train-audio-00/aldfly/XC134874.wav','aldfly'],
                    ['/kaggle/input/birdsong-resampled-train-audio-00/aldfly/XC135454.wav', 'aldfly']]

In [None]:
class MixUpTransform(object):
    """Callable MixUp augmentation for (mel spectrogram, one-hot label) pairs.

    A partner sample is drawn from `file_list`, converted to a log-mel
    spectrogram, and blended with the input using a weight drawn from
    Beta(alpha, alpha). The labels are blended with the same weight.

    Args:
        file_list: sequence of `[wav_path, ebird_code]` entries.
        alpha: parameter of the symmetric Beta distribution for the mix weight.
    """

    def __init__(self,
                 file_list,
                 alpha=0.4):
        self.file_list = file_list
        self.alpha = alpha

    def __call__(self, idx, melspec, one_hot_label):
        """Mix the sample at position `idx` with a randomly chosen partner."""
        return self.transform(idx, melspec, one_hot_label)

    def transform(self, idx, melspec, one_hot_label):
        """Return `(mixed_melspec, mixed_label)` for the sample at `idx`.

        `melspec` must have the same shape as the partner's spectrogram
        produced by `_convert_mel_spectrogram`, otherwise the blend fails.
        """
        # Prepare the partner sample.
        pair_data_idx = self._get_pair_index(idx)
        pair_wav_path, pair_ebird_code = self.file_list[pair_data_idx]
        pair_melspec = self._convert_mel_spectrogram(pair_wav_path)
        pair_one_hot_label = self._convert_one_hot_label(pair_ebird_code)

        r = np.random.beta(self.alpha, self.alpha, 1)[0]
        # Blend spectrogram and label with the same weight. No clipping is
        # applied: the result is a convex combination of the two inputs, and
        # the dB-scaled spectrograms are not confined to [0, 1].
        mixed_melspec = r * melspec + (1 - r) * pair_melspec
        mixed_label = r * one_hot_label + (1 - r) * pair_one_hot_label

        return mixed_melspec, mixed_label

    def _convert_mel_spectrogram(self, wav_path):
        """Load `wav_path`, crop/pad it to PERIOD seconds, return its log-mel spectrogram."""
        y, sr = sf.read(wav_path)

        len_y = len(y)
        effective_length = sr * PERIOD
        if len_y < effective_length:
            # Too short: place the clip at a random offset inside a zero buffer.
            new_y = np.zeros(effective_length, dtype=y.dtype)
            start = np.random.randint(effective_length - len_y)
            new_y[start:start + len_y] = y
            y = new_y
        elif len_y > effective_length:
            # Too long: take a random window of the target length.
            start = np.random.randint(len_y - effective_length)
            y = y[start:start + effective_length]
        y = y.astype(np.float32)

        # `y` is passed by keyword: newer librosa releases no longer accept it positionally.
        melspec = librosa.feature.melspectrogram(y=y, sr=sr)
        melspec = librosa.power_to_db(melspec).astype(np.float32)

        return melspec

    def _convert_one_hot_label(self, ebird_code):
        """Return a float32 one-hot vector over BIRD_CODE for `ebird_code`."""
        labels = np.zeros(len(BIRD_CODE), dtype="f")
        labels[BIRD_CODE[ebird_code]] = 1

        return labels

    def _get_pair_index(self, idx):
        """Pick a random index into `file_list` other than `idx`.

        Raises:
            IndexError: if there is no other entry to pair with.
        """
        candidates = list(range(0, idx)) + list(range(idx + 1, len(self.file_list)))
        if not candidates:
            raise IndexError(
                "MixUp needs at least two entries in file_list to pick a partner"
            )

        return random.choice(candidates)

In [None]:
# MixUp over the two sample files; rebinds `transform`, replacing the SpecAugment instance.
transform = MixUpTransform(sample_file_list, alpha=0.4)

In [None]:
# Length, in seconds, of the clip that is turned into a spectrogram.
PERIOD = 5


def test_spectrogram(file_list, idx, mixup_transforms=None):
    """Build the log-mel spectrogram and one-hot label for `file_list[idx]`.

    Args:
        file_list: sequence of `[wav_path, ebird_code]` entries.
        idx: position of the entry to load.
        mixup_transforms: optional callable `(idx, melspec, label)` returning a
            mixed `(melspec, label)` pair.

    Returns:
        (melspec, label, sr): float32 dB-scaled mel spectrogram, label vector
        over BIRD_CODE, and the file's sampling rate.
    """
    wav_path, ebird_code = file_list[idx]
    y, sr = sf.read(wav_path)

    len_y = len(y)
    effective_length = sr * PERIOD
    if len_y < effective_length:
        # Too short: place the clip at a random offset inside a zero buffer.
        new_y = np.zeros(effective_length, dtype=y.dtype)
        start = np.random.randint(effective_length - len_y)
        new_y[start:start + len_y] = y
        y = new_y
    elif len_y > effective_length:
        # Too long: take a random window of the target length.
        start = np.random.randint(len_y - effective_length)
        y = y[start:start + effective_length]
    y = y.astype(np.float32)

    # `y` is passed by keyword: newer librosa releases no longer accept it positionally.
    melspec = librosa.feature.melspectrogram(y=y, sr=sr)
    melspec = librosa.power_to_db(melspec).astype(np.float32)

    label = np.zeros(len(BIRD_CODE), dtype="f")
    label[BIRD_CODE[ebird_code]] = 1

    if mixup_transforms:
        melspec, label = mixup_transforms(idx, melspec, label)

    return melspec, label, sr

## mixupなし

In [None]:
# First sample file, no MixUp.
melspec, label, sr = test_spectrogram(file_list=sample_file_list, idx=0, mixup_transforms=None)
librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()

In [None]:
# Second sample file, no MixUp.
melspec, label, sr = test_spectrogram(file_list=sample_file_list, idx=1, mixup_transforms=None)
librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()

## mixupあり

In [None]:
# First sample file mixed with a randomly chosen partner from the list.
auged_melspec, label, sr = test_spectrogram(file_list=sample_file_list, idx=0, mixup_transforms=transform)
librosa.display.specshow(auged_melspec, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()

https://github.com/ebouteillon/freesound-audio-tagging-2019/blob/master/code/training-cnn-model1.ipynb
ここからαは0.4にした