In [2]:
import os

import matplotlib.pyplot as plt
import torchaudio
import torchaudio.transforms as T
from IPython.display import Audio, display

from SpectrogramAugmentations import SpectrogramAugmentations
from WaveformAugmentations import WaveformAugmentations

## Waveform augmentations

In [3]:
# Load one "house" utterance from the training set; it serves as the clean
# reference clip for the augmentation demos in this notebook.
wav_path = os.path.join("data", "train", "audio", "house", "0ab3b47d_nohash_0.wav")
loaded_waveform, sample_rate = torchaudio.load(wav_path)
# torchaudio.load returns a (channels, frames) tensor; squeeze(0) drops the
# channel axis only when it has size 1, so a mono clip becomes 1-D.
waveform = loaded_waveform.squeeze(0)
Audio(data=waveform.numpy(), rate=sample_rate)

Time shift

In [None]:
# Apply the time_shift augmentation to the reference clip and play the result.
# NOTE(review): shift_limit is presumably a fraction of the clip length —
# confirm against WaveformAugmentations.time_shift.
augmented_waveform = WaveformAugmentations.time_shift(
    waveform,
    shift_limit=0.4,
)
Audio(data=augmented_waveform.numpy(), rate=sample_rate)

Noise addition

In [None]:
# Apply the add_noise augmentation to the reference clip and play the result.
augmented_waveform = WaveformAugmentations.add_noise(
    waveform,
    noise_level=0.005,
)
Audio(data=augmented_waveform.numpy(), rate=sample_rate)

Pitch shift

In [None]:
# Apply the pitch_shift augmentation (using its default settings) to the
# reference clip and play the result at the clip's sample rate.
augmented_waveform = WaveformAugmentations.pitch_shift(
    waveform,
    sample_rate,
)
Audio(data=augmented_waveform.numpy(), rate=sample_rate)

Volume control

In [None]:
# Apply the volume_control augmentation to the reference clip.
# NOTE(review): gain_range is presumably expressed in dB — confirm against
# WaveformAugmentations.volume_control.
augmented_waveform = WaveformAugmentations.volume_control(waveform, gain_range=(6, 10))

# IPython's Audio rescales sample arrays to full scale by default
# (normalize=True), which cancels any gain change and makes this demo sound
# identical to the original. Play both clips un-normalized so the loudness
# difference is audible. normalize=False raises if any sample lies outside
# [-1, 1], so clamp first (samples pushed past full scale are clipped).
display(Audio(waveform.clamp(-1.0, 1.0).numpy(), rate=sample_rate, normalize=False))  # original
Audio(augmented_waveform.clamp(-1.0, 1.0).numpy(), rate=sample_rate, normalize=False)  # augmented

Speed change

In [None]:
# Apply the speed_change augmentation to the reference clip and play the
# result at the original sample rate.
augmented_waveform = WaveformAugmentations.speed_change(
    waveform,
    sample_rate,
    speed_range=(0.8, 1.2),
)
Audio(data=augmented_waveform.numpy(), rate=sample_rate)

Reverb (echoes)

In [None]:
# Apply the reverb augmentation to the reference clip and play the result.
augmented_waveform = WaveformAugmentations.reverb(
    waveform,
    sample_rate,
    reverb_range=(0.5, 0.7),
)
Audio(data=augmented_waveform.numpy(), rate=sample_rate)

Mix with background noise

In [None]:
# Load a background-noise recording to mix underneath the reference clip.
background_path = os.path.join("data", "train", "audio", "_background_noise_", "exercise_bike.wav")
background_waveform, background_sample_rate = torchaudio.load(background_path)
background_waveform = background_waveform.squeeze(0)  # remove channel dimension

# Sample-wise mixing is only meaningful when both signals share a sample rate,
# so bring the background to the speech clip's rate if the files differ.
if background_sample_rate != sample_rate:
    background_waveform = T.Resample(background_sample_rate, sample_rate)(background_waveform)

augmented_waveform = WaveformAugmentations.mix_background(waveform, background_waveform, mix_ratio_range=(0.1, 0.3))
Audio(augmented_waveform.numpy(), rate=sample_rate)

Convolution reverb (RIR)

In [None]:
# Load the signal used as the room impulse response (RIR) for the convolution.
# NOTE(review): running_tap.wav lives in the background-noise folder, so it
# looks like a noise recording rather than a measured impulse response —
# confirm this stand-in is intentional.
rir_path = os.path.join("data", "train", "audio", "_background_noise_", "running_tap.wav")
rir_waveform, rir_sample_rate = torchaudio.load(rir_path)
rir_waveform = rir_waveform.squeeze(0)  # remove channel dimension

# Convolving signals recorded at different sample rates distorts the result in
# time, so resample the RIR to the speech clip's rate if the files differ.
if rir_sample_rate != sample_rate:
    rir_waveform = T.Resample(rir_sample_rate, sample_rate)(rir_waveform)

augmented_waveform = WaveformAugmentations.convolution_reverb(waveform, rir_waveform)
Audio(augmented_waveform.numpy(), rate=sample_rate)

## Spectrogram augmentations

In [None]:
# Build the log-mel pipeline at the clip's actual sample rate (as returned by
# torchaudio.load) instead of a hard-coded 16 kHz, so the mel filterbank stays
# correct if a clip with a different rate is loaded.
mel_spectrogram = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_mels=64,
)
db_transform = T.AmplitudeToDB()

# Power mel spectrogram converted to decibels; this is the input for the
# spectrogram augmentation demos.
spectrogram = db_transform(mel_spectrogram(waveform))

# MelSpectrogram yields (..., n_mels, time), so rows are mel bins and columns
# are frames; origin="lower" puts the lowest mel bin at the bottom.
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(spectrogram.squeeze().numpy(), origin="lower", aspect="auto", cmap="viridis")
ax.set(title="Mel spectrogram (dB)", xlabel="Frame", ylabel="Mel bin")
fig.colorbar(image, ax=ax, label="dB")
plt.show()

Time masking

In [None]:
# Apply the time_masking augmentation to the dB mel spectrogram and plot it.
augmented_spectrogram = SpectrogramAugmentations.time_masking(spectrogram, time_mask_param=20)

# Explicit Figure/Axes API plus a title so the figure is self-describing when
# the notebook is skimmed.
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(augmented_spectrogram.squeeze().numpy(), origin="lower", aspect="auto", cmap="viridis")
ax.set_title("Spectrogram after time masking (time_mask_param=20)")
fig.colorbar(image, ax=ax)
plt.show()

Frequency masking

In [None]:
# Apply the freq_masking augmentation to the dB mel spectrogram and plot it.
augmented_spectrogram = SpectrogramAugmentations.freq_masking(spectrogram, freq_mask_param=20)

# Explicit Figure/Axes API plus a title so the figure is self-describing when
# the notebook is skimmed.
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(augmented_spectrogram.squeeze().numpy(), origin="lower", aspect="auto", cmap="viridis")
ax.set_title("Spectrogram after frequency masking (freq_mask_param=20)")
fig.colorbar(image, ax=ax)
plt.show()