# Audio Augmentation 
This notebook experiments with audio augmentation: Google's SpecAugment techniques, which were introduced for speech recognition, plus a few basic waveform augmentations. It is my first time working with audio in deep learning, so I decided to try some audio augmentation. Only a single mp3 file is used here, but the same steps can be applied to the whole dataset.
Augmentations performed:
* Time Shift
* Speed Rate  Manipulation
* Frequency Masking
* Time Masking

Importing Libraries

In [None]:
import random
import librosa
import scipy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import IPython.display as ipd
import cv2
import torch
import torchaudio
from torchaudio import transforms

%matplotlib inline

Loading the audio file with librosa

In [None]:
# Location of the single recording that every augmentation below works on.
file_path = "../input/birdsong-recognition/train_audio/aldfly/XC134874.mp3"

In [None]:
# Decode the recording with librosa; sr=None is passed so that no target
# sampling rate is requested.
wav, sr = librosa.load(file_path, sr=None)
# Quick sanity check: number of samples and the amplitude extremes.
extremes = (wav.max(), wav.min())
print(wav.shape, *extremes)
# Last expression of the cell: renders an audio player for the source file.
ipd.Audio(file_path)

In [None]:
def show_spectrogram(wav):
    """Return the magnitude of the short-time Fourier transform of ``wav``.

    Despite the name, nothing is drawn here: the caller is expected to plot
    the returned array. The STFT uses a 480-sample Hamming window, a
    480-point FFT and a hop of 160 samples.
    """
    stft = librosa.stft(
        wav,
        n_fft=480,
        hop_length=160,
        win_length=480,
        window='hamming',
    )
    # magphase splits the complex STFT; only the magnitude part is needed.
    magnitude, _ = librosa.magphase(stft)
    return magnitude

**Showing the log-magnitude spectrogram**

In [None]:
# Small offset added before the logarithm so that bins whose magnitude is
# exactly zero map to a finite value instead of -inf (which would also
# distort the colour scale of the plot). Same value as the other spectrogram
# cells in this notebook use.
EPS = 1e-8
log_spect = np.log(show_spectrogram(wav) + EPS)
print('spectrogram shape:', log_spect.shape)
# origin='lower' puts row 0 of the array at the bottom of the image.
plt.imshow(log_spect, aspect='auto', origin='lower')
plt.title('spectrogram of origin audio')
plt.show()

# Time Shifting  
> Basic time shift: the audio is moved forwards or backwards by a randomly drawn number of samples, and the gap is filled with low-level noise.

In [None]:
# Draw a signed shift, in samples, from [-18000, 18000).
start_ = int(np.random.uniform(-18000, 18000))
print('time shift: ', start_)
if start_ < 0:
    # Negative shift: drop the tail and prepend the same amount of faint noise.
    wav_time_shift = np.concatenate(
        (np.random.uniform(-0.01, 0.01, -start_), wav[:start_])
    )
else:
    # Non-negative shift: drop the head and append the same amount of faint noise.
    wav_time_shift = np.concatenate(
        (wav[start_:], np.random.uniform(-0.01, 0.01, start_))
    )
ipd.Audio(wav_time_shift, rate=sr)


Output

In [None]:
EPS = 1e-8  # offset added before the logarithm so zero magnitudes stay finite
shifted_spect = show_spectrogram(wav_time_shift)
log_spect = np.log(shifted_spect + EPS)
print('spectrogram shape:', log_spect.shape)
# origin='lower' puts row 0 of the array at the bottom of the image.
plt.imshow(log_spect, origin='lower', aspect='auto')
plt.title('spectrogram of time shifted audio')
plt.show()

# Speed Rate

In [None]:
# Fixed number of samples the speed-tuned clip is padded or cropped to.
# NOTE(review): presumably the length of this particular recording - confirm
# before reusing the cell on other files.
TARGET_LEN = 1223424

speed_rate = np.random.uniform(0.7, 1.3)
# The waveform is resized like an image to int(len(wav) * speed_rate) samples.
wav_speed_tune = cv2.resize(wav, (1, int(len(wav) * speed_rate))).squeeze()
print('speed rate: %.3f' % speed_rate, '(lower is faster)')

surplus = len(wav_speed_tune) - TARGET_LEN
if surplus >= 0:
    # Too long (or exact): keep the central TARGET_LEN samples.
    first = surplus // 2
    wav_speed_tune = wav_speed_tune[first:first + TARGET_LEN]
else:
    # Too short: surround the clip with faint noise, the extra sample (if
    # any) going to the right-hand side.
    missing = -surplus
    left_pad = np.random.uniform(-0.001, 0.001, missing // 2)
    right_pad = np.random.uniform(-0.001, 0.001, missing - missing // 2)
    wav_speed_tune = np.concatenate((left_pad, wav_speed_tune, right_pad))
print('wav length: ', wav_speed_tune.shape[0])
ipd.Audio(wav_speed_tune, rate=sr)

Output

In [None]:
# EPS (defined in an earlier cell) keeps the logarithm finite for zero bins.
speed_spect = show_spectrogram(wav_speed_tune)
log_spect = np.log(speed_spect + EPS)
print('spectrogram shape:', log_spect.shape)
# origin='lower' puts row 0 of the array at the bottom of the image.
plt.imshow(log_spect, origin='lower', aspect='auto')
plt.title('spectrogram of speed tuned audio')
plt.show()

# Loading Audio on pytorch

In [None]:
# Load the same file with torchaudio. `sample` keeps the (waveform, sample
# rate) pair together for tfm_spectro; `audio` and `sr` expose its two parts.
# Note that `sr` is rebound here to the rate reported by torchaudio.
sample = torchaudio.load(file_path)
audio, sr = sample


Converting the audio into a mel spectrogram

In [None]:

def tfm_spectro(ad, sr=16000, to_db_scale=False, n_fft=1024,
                ws=None, hop=None, f_min=0.0, f_max=-80, pad=0, n_mels=128,
                top_db=80.0):
    """Turn a ``(waveform, sample_rate)`` pair into a mel spectrogram tensor.

    Args:
        ad: Pair whose first item is the waveform tensor and whose second
            item is its sample rate.
        sr: Unused; the sample rate is always read from ``ad[1]``. Kept so
            existing callers that pass it keep working.
        to_db_scale: When true, the result is passed through
            ``transforms.AmplitudeToDB``.
        n_fft: FFT size handed to ``transforms.MelSpectrogram``.
        ws: Window size, forwarded as ``win_length`` (``None`` leaves the
            choice to torchaudio).
        hop: Hop between frames, forwarded as ``hop_length``.
        f_min: Lowest frequency of the mel filterbank.
        f_max: Highest frequency of the mel filterbank. Non-positive values
            (including the legacy default of -80) are replaced by ``None``.
        pad: Padding forwarded to ``transforms.MelSpectrogram``.
        n_mels: Number of mel bands.
        top_db: ``top_db`` forwarded to ``transforms.AmplitudeToDB``. This
            used to be taken from ``f_max``, which mixed a frequency limit
            with a decibel range.

    Returns:
        The output of ``transforms.MelSpectrogram`` with its last two
        dimensions swapped, optionally converted by ``AmplitudeToDB``.
    """
    # A non-positive upper frequency cannot bound a mel filterbank; None
    # leaves the upper limit to torchaudio.
    mel_f_max = f_max if f_max is not None and f_max > 0 else None
    to_mel = transforms.MelSpectrogram(
        sample_rate=ad[1], n_fft=n_fft, win_length=ws, hop_length=hop,
        f_min=f_min, f_max=mel_f_max, pad=pad, n_mels=n_mels,
    )
    # The waveform is flattened to a single row before the transform.
    mel = to_mel(ad[0].reshape(1, -1))
    # NOTE(review): recent torchaudio appears to return (channel, n_mels,
    # time); if so, this swap yields (channel, time, n_mels) - confirm
    # against the installed version before relying on axis meaning.
    mel = mel.permute(0, 2, 1)
    if to_db_scale:
        # NOTE(review): stype='magnitude' is kept from the original; confirm
        # it matches the kind of spectrogram MelSpectrogram produces.
        mel = transforms.AmplitudeToDB(stype='magnitude', top_db=top_db)(mel)
    return mel


# f_min is a frequency, so it starts at 0.0 here rather than the previous -80.
spectro = tfm_spectro(sample, ws=512, hop=256, n_mels=128, to_db_scale=True, f_max=8000, f_min=0.0)

In [None]:
#displaying
def tensor_to_img(spectrogram):
    """Plot the first slice of ``spectrogram`` and display the tensor's shape."""
    first_slice = spectrogram[0]
    # origin='lower' puts row 0 of the slice at the bottom of the image.
    plt.imshow(first_slice, origin='lower', aspect='auto')
    plt.show()
    display(spectrogram.shape)


tensor_to_img(spectro)

# Frequency Masking
Apply masking to a spectrogram in the frequency domain.

In [None]:
def freq_mask(spec, F=250, num_masks=1):
    """Return a copy of ``spec`` with random bands along dimension 1 masked.

    Each mask overwrites a contiguous index range along dimension 1 of
    ``spec[0]`` with the current mean of the whole tensor. ``spec`` itself
    is never modified.

    Args:
        spec: Tensor with at least three dimensions; only ``spec[0]`` is
            masked. Assumes dimension 1 is the frequency axis - TODO confirm
            against the layout produced by the spectrogram code.
        F: Exclusive upper bound on the width drawn for each mask. It is
            capped at the size of dimension 1 so the random draws below are
            always valid.
        num_masks: Number of masks to attempt.

    Returns:
        The masked copy. When ``F`` or dimension 1 is not positive the copy
        is returned untouched.
    """
    masked = spec.clone()
    axis_len = masked.shape[1]
    # Without this cap `axis_len - width` can become non-positive, which
    # makes random.randrange raise on an empty range.
    max_width = min(F, axis_len)
    if max_width <= 0:
        return masked
    for _ in range(num_masks):
        width = random.randrange(0, max_width)
        if width == 0:
            # Nothing to mask this round; keep going so the remaining masks
            # are still applied.
            continue
        start = random.randrange(0, axis_len - width)
        end = random.randrange(start, start + width)
        masked[0][start:end] = masked.mean()
    return masked

Output

In [None]:
def test_freq_mask():
    """Show the unmasked spectrogram, then a copy with five frequency masks."""
    print('Original')
    tensor_to_img(spectro)

    print('5 masks')
    masked = freq_mask(spectro, num_masks=5)
    tensor_to_img(masked)


test_freq_mask()

# Time Masking
Apply masking to a spectrogram in the time domain.

In [None]:
def time_mask(spec, time=40, num_masks=1):
    """Return a copy of ``spec`` with random bands along dimension 2 masked.

    Each mask overwrites a contiguous index range along dimension 2 of
    ``spec[0]`` with the current mean of the whole tensor. ``spec`` itself
    is never modified.

    Args:
        spec: Tensor with at least three dimensions; only ``spec[0]`` is
            masked. Assumes dimension 2 is the time axis - TODO confirm
            against the layout produced by the spectrogram code.
        time: Exclusive upper bound on the width drawn for each mask. It is
            capped at the size of dimension 2 so the random draws below are
            always valid.
        num_masks: Number of masks to attempt.

    Returns:
        The masked copy. When ``time`` or dimension 2 is not positive the
        copy is returned untouched.
    """
    masked = spec.clone()
    axis_len = masked.shape[2]
    # Without this cap `axis_len - width` can become non-positive, which
    # makes random.randrange raise on an empty range.
    max_width = min(time, axis_len)
    if max_width <= 0:
        return masked
    for _ in range(num_masks):
        width = random.randrange(0, max_width)
        if width == 0:
            # A zero-width draw used to hit `return cloned`, an undefined
            # name; skip this mask and carry on with the remaining ones.
            continue
        start = random.randrange(0, axis_len - width)
        end = random.randrange(start, start + width)
        masked[0][:, start:end] = masked.mean()
    return masked

Output

In [None]:
def test_time_mask():
    """Show the spectrogram with one time mask, then with two."""
    print('One Mask')
    single = time_mask(spectro)
    tensor_to_img(single)

    print('Two Mask')
    double = time_mask(spectro, num_masks=2)
    tensor_to_img(double)


test_time_mask()

I still have many changes left to make; this was my first attempt, inspired by SpecAugment. Please leave a like and any recommendations in the comments.