# Data augmentation with audiomentations

Here is an example notebook for [audiomentations](https://github.com/iver56/audiomentations), a library for audio data augmentation.

I'm new to this community, so any suggestions for a better notebook, better results, or the competition in general are welcome.

Also, I haven't used these augmentations in a submission yet, so I'm keen to find out whether they work well.


## Prepare for execution

In [None]:
!pip install audiomentations
!pip install wavio pyloudnorm ffmpeg pydub # install extra dependencies

Load modules and define some useful functions. 
Some parts of the code are adapted from [ResNet34 More Augmentations+Mixup+TTA (Inference)](https://www.kaggle.com/khoongweihao/resnet34-more-augmentations-mixup-tta-inference) and [All-in-one RFCX baseline for beginners](https://www.kaggle.com/c/rfcx-species-audio-detection).

In [None]:
# Define some helper functions for pretty figures.
import csv
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Audio, display

def show_signal(raw_wav, sr, title=None):
    """Plot a signal's waveform and spectrogram and return an audio player for it.

    Args:
        raw_wav: 1-D array of float samples. Values outside [-1.0, 1.0] are
            clipped for playback only; the plots use the data as given.
        sr: Sample rate of ``raw_wav`` in Hz.
        title: Optional title shown above the waveform panel.

    Returns:
        IPython.display.Audio: Player for the signal converted to 16-bit PCM.
    """
    plt.figure(figsize=(5, 5))

    # Top panel: waveform against time in seconds.
    ax_wave = plt.subplot(2, 1, 1)
    if title:
        ax_wave.set_title(title)
    ax_wave.plot(np.arange(len(raw_wav)) / sr, raw_wav)

    # Bottom panel: STFT magnitude in dB, sharing the time axis with the waveform.
    ax_spec = plt.subplot(2, 1, 2, sharex=ax_wave)
    spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(raw_wav)), ref=np.max)
    # The STFT rows are linearly spaced frequency bins, so the axis is labelled
    # in Hz; a mel axis would attach mel-scale frequencies to non-mel data.
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='hz', ax=ax_spec)

    # Clip before casting: a sample of 1.0 (or louder, e.g. after Gain or
    # Normalize) scales to >= 32768, which does not fit in int16 and would
    # otherwise wrap around into loud artefacts.
    pcm = np.clip(np.asarray(raw_wav) * 2**15, -2**15, 2**15 - 1).astype(np.int16)
    return Audio(pcm, rate=sr)
    
    
def compare_signals(wav1, sr1, wav2, sr2, titles=None):
    """Plot and play two signals side by side for a before/after comparison.

    The left column shows ``wav1`` and the right column shows ``wav2``; each
    column has the waveform on top and the STFT spectrogram (in dB) below.
    An audio player is displayed for each signal after the plots.

    Args:
        wav1: 1-D array of float samples for the first signal.
        sr1: Sample rate of ``wav1`` in Hz.
        wav2: 1-D array of float samples for the second signal.
        sr2: Sample rate of ``wav2`` in Hz.
        titles: Optional pair of labels ``[first, second]`` used as panel
            titles and printed above the matching audio player.

    Returns:
        None. All output is produced through matplotlib and IPython display.
    """
    signals = ((wav1, sr1), (wav2, sr2))

    plt.figure(figsize=(10, 5))
    for column, (wav, rate) in enumerate(signals):
        # Waveform in the top row, using this signal's own sample rate for the
        # time axis (not a notebook-level global).
        ax_wave = plt.subplot(2, 2, column + 1)
        if titles:
            ax_wave.set_title(titles[column])
        ax_wave.plot(np.arange(len(wav)) / rate, wav)

        # Spectrogram in the bottom row, time axis shared with the waveform above.
        ax_spec = plt.subplot(2, 2, column + 3, sharex=ax_wave)
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(wav)), ref=np.max)
        # STFT rows are linearly spaced frequency bins, so label the axis in Hz.
        librosa.display.specshow(spec_db, sr=rate, x_axis='time', y_axis='hz', ax=ax_spec)

    for column, (wav, rate) in enumerate(signals):
        if titles:
            print(titles[column])
        # Clip before casting so samples at or beyond +/-1.0 do not wrap
        # around when converted to int16.
        pcm = np.clip(np.asarray(wav) * 2**15, -2**15, 2**15 - 1).astype(np.int16)
        display(Audio(pcm, rate=rate))

## Load sample data

The code here is adapted from [All-in-one RFCX baseline for beginners](https://www.kaggle.com/c/rfcx-species-audio-detection), with modifications.

In [None]:
# Read the true-positive annotation table. newline='' lets the csv module do
# its own newline handling, as the csv documentation recommends.
with open('/kaggle/input/rfcx-species-audio-detection/train_tp.csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

# Column 0 of a row is used as the recording's file name (without extension).
# NOTE(review): data[0] is presumably the CSV header row - confirm.
# sr=None keeps the file's native sample rate instead of resampling.
wav, sr = librosa.load('/kaggle/input/rfcx-species-audio-detection/train/' + data[10][0] + '.flac', sr=None)

show_signal(wav, sr)

## Test augmentations

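The probability $p$ is fixed to 1.0 in the per-transform examples below, which means the augmentation is always (100%) applied. Only the official example keeps $p = 0.5$ for each transform.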

In [None]:
from audiomentations import *

### Do the official example

In [None]:
SAMPLE_RATE = 16000

# Each transform here is applied with probability 0.5, so the output varies
# from run to run.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
])

# Generate 2 seconds of dummy audio for the sake of example
samples = np.random.uniform(low=-0.2, high=0.2, size=(2 * SAMPLE_RATE,)).astype(np.float32)

# Augment/transform/perturb the audio data
augmented_samples = augment(samples=samples, sample_rate=SAMPLE_RATE)
print(augmented_samples)

# The dummy audio is sampled at SAMPLE_RATE, not at the `sr` of the clip
# loaded from disk, so plot and play it at SAMPLE_RATE.
compare_signals(samples, SAMPLE_RATE, augmented_samples, SAMPLE_RATE, titles=["Before", "After"])

### AddBackgroundNoise

In [None]:
# Use the training recordings as the pool of background noise.
# The absolute path matches the one used when loading the sample clip, so this
# cell does not depend on the notebook's current working directory.
augment = Compose([
    AddBackgroundNoise(sounds_path="/kaggle/input/rfcx-species-audio-detection/train/",
                       min_snr_in_db=3,
                       max_snr_in_db=30,
                       p=1.0)
])

# Augment/transform/perturb the audio data
augmented_samples = augment(samples=wav, sample_rate=sr)

compare_signals(wav, sr, augmented_samples, sr, titles=["Before", "After"])

### AddGaussianNoise

In [None]:
# Same amplitude range as in the official example above (described by the
# author as the default values); p=1.0 so the noise is always added.
augment = Compose(transforms=[
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### AddShortNoises

In [None]:
# Use the training recordings as the pool of short noise sounds.
# The absolute path matches the one used when loading the sample clip, so this
# cell does not depend on the notebook's current working directory.
augment = Compose([
    AddShortNoises(sounds_path="/kaggle/input/rfcx-species-audio-detection/train/",
                   min_snr_in_db=0,
                   max_snr_in_db=24,
                   min_time_between_sounds=4.0,
                   max_time_between_sounds=16.0,
                   burst_probability=0.22,
                   min_pause_factor_during_burst=0.1,
                   max_pause_factor_during_burst=1.1,
                   min_fade_in_time=0.005,
                   max_fade_in_time=0.08,
                   min_fade_out_time=0.01,
                   max_fade_out_time=0.1,
                   p=1.0)
])

# Augment/transform/perturb the audio data
augmented_samples = augment(samples=wav, sample_rate=sr)

compare_signals(wav, sr, augmented_samples, sr, titles=["Before", "After"])

### ClippingDistortion

In [None]:
# ClippingDistortion with a percentile threshold range of 0-40, always applied (p=1.0).
augment = Compose(transforms=[
    ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=40, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### FrequencyMask

In [None]:
# FrequencyMask with a frequency band range of 0.0-0.5, always applied (p=1.0).
augment = Compose(transforms=[
    FrequencyMask(min_frequency_band=0.0, max_frequency_band=0.5, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### Gain

In [None]:
# Gain drawn from the -12 dB to +12 dB range, always applied (p=1.0).
augment = Compose(transforms=[
    Gain(min_gain_in_db=-12, max_gain_in_db=12, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### Mp3Compression

In [None]:
# Mp3Compression with a bitrate range of 8-64 via the "pydub" backend, always applied (p=1.0).
augment = Compose(transforms=[
    Mp3Compression(min_bitrate=8, max_bitrate=64, backend="pydub", p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### LoudnessNormalization

In [None]:
# LoudnessNormalization with a target range of -31 to -13 LUFS, always applied (p=1.0).
augment = Compose(transforms=[
    LoudnessNormalization(min_lufs_in_db=-31, max_lufs_in_db=-13, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### Normalize

In [None]:
# Normalize takes no range parameters here; it is always applied (p=1.0).
augment = Compose(transforms=[Normalize(p=1.0)])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### PitchShift

In [None]:
# PitchShift by -4 to +4 semitones, always applied (p=1.0).
augment = Compose(transforms=[
    PitchShift(min_semitones=-4, max_semitones=4, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### PolarityInversion

In [None]:
# PolarityInversion takes no range parameters here; it is always applied (p=1.0).
augment = Compose(transforms=[PolarityInversion(p=1.0)])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### Resample

In [None]:
# Keep a handle on the transform so the randomly chosen target rate can be
# read back after it has run.
resample = Resample(min_sample_rate=8000,
                    max_sample_rate=44100,
                    p=1.0)
augment = Compose([resample])

# Augment/transform/perturb the audio data
augmented_samples = augment(samples=wav, sample_rate=sr)

# The augmented samples are at a new sample rate, so plotting or playing them
# at the original `sr` would change their apparent speed and pitch.
augmented_sr = resample.parameters["target_sample_rate"]

compare_signals(wav, sr, augmented_samples, augmented_sr, titles=["Before", "After"])

### Shift

In [None]:
# Shift by a fraction in the -0.5 to 0.5 range with rollover enabled, always applied (p=1.0).
augment = Compose(transforms=[
    Shift(
        min_fraction=-0.5,
        max_fraction=0.5,
        rollover=True,
        p=1.0,
    ),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### TimeMask

In [None]:
# TimeMask with a band part range of 0.0-0.5 and no fade, always applied (p=1.0).
augment = Compose(transforms=[
    TimeMask(min_band_part=0.0, max_band_part=0.5, fade=False, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### TimeStretch

In [None]:
# TimeStretch with a rate range of 0.8-1.25 and leave_length_unchanged=True,
# always applied (p=1.0).
augment = Compose(transforms=[
    TimeStretch(min_rate=0.8, max_rate=1.25, leave_length_unchanged=True, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])

### Trim
The example clip has no trailing silence, so the effect of `Trim` is not visible in this case.

In [None]:
# Trim with top_db=20, always applied (p=1.0).
augment = Compose(transforms=[
    Trim(top_db=20, p=1.0),
])

# Run the pipeline on the sample clip loaded earlier.
augmented_samples = augment(samples=wav, sample_rate=sr)

# Plot and play the original next to the augmented signal.
compare_signals(wav1=wav, sr1=sr, wav2=augmented_samples, sr2=sr, titles=["Before", "After"])