In [None]:
import os
import re
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import cv2
import matplotlib.pyplot as plt
import IPython.display as ipd


from tqdm import tqdm
from pydub import AudioSegment

import albumentations
from albumentations.core.transforms_interface import DualTransform, BasicTransform

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load one example recording and set its frame rate to the working sample rate.
path = f"../input/birdsong-resampled-train-audio-03/norhar2/XC143657.wav"
sample_rate = 16000
sound = AudioSegment.from_wav(path)
sound = sound.set_frame_rate(sample_rate)

# Pack the samples and the sample rate into the (samples, sample_rate) tuple
# that is passed as `data=` to the transforms. The raw sample values are only
# cast to float32 here; no rescaling is applied.
data = np.array(sound.get_array_of_samples(), dtype=np.float32), sample_rate

In [None]:
class AudioTransform(BasicTransform):
    """Common base for the audio transforms defined in this notebook.

    Subclasses implement ``apply``; the payload is supplied under the
    ``data`` keyword when the transform is called.
    """

    @property
    def targets(self):
        # Map the "data" target onto this transform's ``apply`` method.
        return {"data": self.apply}

    def update_params(self, params, **kwargs):
        """Copy optional instance attributes into ``params`` and return it."""
        # Only attributes that the instance actually defines are forwarded.
        for attr_name in ("interpolation", "fill_value"):
            if hasattr(self, attr_name):
                params[attr_name] = getattr(self, attr_name)
        return params

In [None]:
    """It simply add some random value into data by using numpy"""
    # NOTE(review): this cell begins inside a class body -- no `class` header is
    # present in the cell -- and the same methods appear again in the
    # NoiseInjection class defined in the following cell.
    def __init__(self, always_apply=False, p=0.5):
        """Forward ``always_apply`` and ``p`` to the parent initializer."""
        super(NoiseInjection, self).__init__(always_apply, p)
    
    def apply(self, data, noise_levels=(0, 0.5), **params):
        """Add Gaussian noise scaled by a randomly drawn level.

        Args:
            data: ``(sound, sr)`` tuple; ``sound`` is a 1-D sample sequence.
            noise_levels: ``(low, high)`` bounds passed to
                ``np.random.uniform`` to draw the noise amplitude.

        Returns:
            ``(augmented_sound, sr)`` where ``augmented_sound`` has the same
            dtype as the input samples.
        """
        sound, sr = data
        sound = np.asarray(sound)
        noise_level = np.random.uniform(*noise_levels)
        noise = np.random.randn(len(sound))
        augmented_sound = sound + noise_level * noise
        # Cast back to the input dtype. Reading the dtype from the array (rather
        # than from the first sample) also works for an empty signal.
        augmented_sound = augmented_sound.astype(sound.dtype)

        return augmented_sound, sr

In [None]:
class NoiseInjection(AudioTransform):
    """Add Gaussian noise with a randomly drawn amplitude to the waveform."""

    def __init__(self, always_apply=False, p=0.5):
        super(NoiseInjection, self).__init__(always_apply, p)

    def apply(self, data, noise_levels=(0, 0.5), **params):
        """Add Gaussian noise scaled by a randomly drawn level.

        Args:
            data: ``(sound, sr)`` tuple; ``sound`` is a 1-D sample sequence.
            noise_levels: ``(low, high)`` bounds passed to
                ``np.random.uniform`` to draw the noise amplitude.

        Returns:
            ``(augmented_sound, sr)`` where ``augmented_sound`` has the same
            dtype as the input samples.
        """
        sound, sr = data
        sound = np.asarray(sound)
        noise_level = np.random.uniform(*noise_levels)
        noise = np.random.randn(len(sound))
        augmented_sound = sound + noise_level * noise
        # Cast back to the input dtype. Reading the dtype from the array (rather
        # than from the first sample) also works for an empty signal.
        augmented_sound = augmented_sound.astype(sound.dtype)

        return augmented_sound, sr

In [None]:
# Apply the noise transform unconditionally (p=1.0) to the loaded clip.
transform = NoiseInjection(p=1.0)
sound_aug, sr = transform(data=data)['data']

# Overlay the original and the augmented waveform.
plt.plot(data[0])
plt.plot(sound_aug)
plt.show()

# Audio players for the original and the augmented signal.
display(ipd.Audio(data[0], rate=sr))
display(ipd.Audio(sound_aug, rate=sr))

In [None]:
class ShiftingTime(AudioTransform):
    """Shift the waveform along the time axis and silence the vacated samples."""

    def __init__(self, always_apply=False, p=0.5):
        super(ShiftingTime, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Shift ``sound`` forwards or backwards by a random number of samples.

        Args:
            data: ``(sound, sr)`` tuple; ``sound`` is a 1-D sample array and
                ``sr`` its sample rate.

        Returns:
            ``(augmented_sound, sr)``. The samples that wrap around during the
            roll are set to zero. Clips shorter than two seconds are returned
            as an unshifted copy.
        """
        sound, sr = data

        # Largest allowed shift in whole seconds: at most half of the clip.
        max_seconds = int((len(sound) / sr) / 2)
        if max_seconds < 1:
            # np.random.randint raises ValueError for an upper bound <= 0, so a
            # clip that is too short to shift is passed through unchanged.
            return np.array(sound, copy=True), sr

        # Draw from [1, max_seconds] so the second draw always has a positive bound.
        shift_max = np.random.randint(1, max_seconds + 1)
        shift = np.random.randint(sr * shift_max)
        direction = np.random.randint(0, 2)
        if direction == 1:
            shift = -shift

        augmented_sound = np.roll(sound, shift)
        # Silence the samples that wrapped around. A zero shift must leave the
        # signal untouched: ``augmented_sound[0:] = 0`` would erase all of it.
        if shift > 0:
            augmented_sound[:shift] = 0
        elif shift < 0:
            augmented_sound[shift:] = 0

        return augmented_sound, sr

    
# Apply the time-shift transform unconditionally (p=1.0) to the loaded clip.
transform = ShiftingTime(p=1.0)
sound_aug, sr = transform(data=data)['data']

# Overlay the original and the shifted waveform.
plt.plot(data[0])
plt.plot(sound_aug)
plt.show()

# Audio players for the original and the shifted signal.
display(ipd.Audio(data[0], rate=sr))
display(ipd.Audio(sound_aug, rate=sr))

In [None]:
class PitchShift(AudioTransform):
    """Shift the pitch of the waveform by a random number of steps."""

    def __init__(self, always_apply=False, p=0.5):
        super(PitchShift, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Pitch-shift ``sound`` with ``librosa.effects.pitch_shift``.

        Args:
            data: ``(sound, sr)`` tuple; ``sound`` is a 1-D float sample array.

        Returns:
            ``(augmented_sound, sr)``.
        """
        sound, sr = data

        # Integer step count in [-10, 9] (the upper bound of randint is
        # exclusive); semitones with librosa's default of 12 bins per octave.
        n_steps = np.random.randint(-10, 10)
        # ``sr`` and ``n_steps`` are passed by keyword: recent librosa releases
        # made them keyword-only, and older releases accept keywords as well.
        augmented_sound = librosa.effects.pitch_shift(sound, sr=sr, n_steps=n_steps)

        return augmented_sound, sr
    
# Apply the pitch-shift transform unconditionally (p=1.0) to the loaded clip.
transform = PitchShift(p=1.0)
sound_aug, sr = transform(data=data)['data']

# Overlay the original and the pitch-shifted waveform.
plt.plot(data[0])
plt.plot(sound_aug)
plt.show()

# Audio players for the original and the pitch-shifted signal.
display(ipd.Audio(data[0], rate=sr))
display(ipd.Audio(sound_aug, rate=sr))

In [None]:
class TimeStretch(AudioTransform):
    """Speed the waveform up or slow it down by a random factor."""

    # Bounds of the stretch factor. The lower bound is kept well above zero:
    # the output length grows as the rate shrinks, so a rate close to 0 yields
    # an enormous signal, and a rate of exactly 0 is not a valid stretch factor.
    MIN_RATE = 0.5
    MAX_RATE = 2.0

    def __init__(self, always_apply=False, p=0.5):
        super(TimeStretch, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Time-stretch ``sound`` with ``librosa.effects.time_stretch``.

        Args:
            data: ``(sound, sr)`` tuple; ``sound`` is a 1-D float sample array.

        Returns:
            ``(augmented_sound, sr)``; the number of samples generally differs
            from the input because the signal is stretched or compressed.
        """
        sound, sr = data

        rate = np.random.uniform(self.MIN_RATE, self.MAX_RATE)
        # ``rate`` is passed by keyword: recent librosa releases made it
        # keyword-only, and older releases accept the keyword as well.
        augmented_sound = librosa.effects.time_stretch(sound, rate=rate)

        return augmented_sound, sr

# Apply the time-stretch transform unconditionally (p=1.0) to the loaded clip.
transform = TimeStretch(p=1.0)
sound_aug, sr = transform(data=data)['data']

# Overlay the original and the stretched waveform.
plt.plot(data[0])
plt.plot(sound_aug)
plt.show()

# Audio players for the original and the stretched signal.
display(ipd.Audio(data[0], rate=sr))
display(ipd.Audio(sound_aug, rate=sr))

In [None]:
class MelSpectrogram(AudioTransform):
    """Convert a waveform into a dB-scaled mel spectrogram."""

    def __init__(self, parameters, always_apply=False, p=0.5):
        """Store ``parameters``, the keyword arguments forwarded to
        ``librosa.feature.melspectrogram`` (e.g. ``n_mels``, ``fmin``, ``fmax``).
        """
        super(MelSpectrogram, self).__init__(always_apply, p)

        self.parameters = parameters

    def apply(self, data, **params):
        """Compute the mel spectrogram of ``sound``.

        Args:
            data: ``(sound, sr)`` tuple; ``sound`` is a 1-D float sample array.

        Returns:
            ``(melspec, sr)`` where ``melspec`` is a float32 array in dB.
        """
        sound, sr = data

        # The signal is passed as ``y=``: recent librosa releases made the
        # arguments keyword-only, and older releases accept the keyword as well.
        melspec = librosa.feature.melspectrogram(y=sound, sr=sr, **self.parameters)
        melspec = librosa.power_to_db(melspec)
        melspec = melspec.astype(np.float32)

        return melspec, sr
    
    
    
# Keyword arguments forwarded to librosa.feature.melspectrogram.
melspectrogram_parameters = {
        "n_mels": 128,
        "fmin": 20,
        # Upper frequency limit is capped at the Nyquist frequency of the clip
        # (sample_rate / 2): the signal has no content above it, so mel filters
        # placed there would be empty.
        "fmax": sample_rate // 2
    }

transform = MelSpectrogram(parameters=melspectrogram_parameters, p=1.0)

melspec, sr = transform(data=data)['data']

# Show the dB-scaled mel spectrogram as an image.
plt.figure(figsize=(20,10))
plt.imshow(melspec)
plt.show()

In [None]:
class SpecAugment(AudioTransform):
    """Mask random frequency bands and time spans of a spectrogram."""

    def __init__(self, num_mask=2, freq_masking=0.15, time_masking=0.20, always_apply=False, p=0.5):
        """Store the masking configuration.

        Args:
            num_mask: upper bound on the number of masking rounds.
            freq_masking: maximum fraction of rows masked per round.
            time_masking: maximum fraction of columns masked per round.
        """
        super(SpecAugment, self).__init__(always_apply, p)

        self.num_mask = num_mask
        self.freq_masking = freq_masking
        self.time_masking = time_masking

    def apply(self, data, **params):
        """Mask ``melspec`` using its own minimum as the fill value.

        Takes a ``(melspec, sr)`` tuple and returns ``(masked_spec, sr)``.
        """
        melspec, sr = data
        fill_with = melspec.min()
        masked = self.spec_augment(
            melspec,
            self.num_mask,
            self.freq_masking,
            self.time_masking,
            fill_with,
        )
        return masked, sr

    def spec_augment(self,
                     spec: np.ndarray,
                     num_mask=2,
                     freq_masking=0.15,
                     time_masking=0.20,
                     value=0):
        """Return a masked copy of the 2-D array ``spec``.

        Between 1 and ``num_mask`` rounds are run; each round overwrites one
        block of rows and one block of columns with ``value``. Block sizes are
        random fractions (up to ``freq_masking`` / ``time_masking``) of the
        respective axis length. The input array is not modified.
        """
        spec = spec.copy()
        n_rows, n_cols = spec.shape

        rounds = random.randint(1, num_mask)
        for _ in range(rounds):
            # Row (frequency) block.
            rows_masked = int(random.uniform(0.0, freq_masking) * n_rows)
            row_start = int(np.random.uniform(low=0.0, high=n_rows - rows_masked))
            spec[row_start:row_start + rows_masked, :] = value

            # Column (time) block.
            cols_masked = int(random.uniform(0.0, time_masking) * n_cols)
            col_start = int(np.random.uniform(low=0.0, high=n_cols - cols_masked))
            spec[:, col_start:col_start + cols_masked] = value

        return spec
    
    
    
# Apply SpecAugment unconditionally (p=1.0) to the mel spectrogram computed above.
transform = SpecAugment(p=1.0)
# NOTE: `data` is rebound here from the waveform tuple to (melspec, sr).
data = melspec, sr

specAug, sr = transform(data=data)['data']

# Show the masked spectrogram.
plt.figure(figsize=(20,10))
plt.imshow(specAug)
plt.show()

In [None]:
class SpectToImage(AudioTransform):
    """Turn a 2-D spectrogram into a 3-channel float32 image."""

    def __init__(self, always_apply=False, p=0.5):
        super(SpectToImage, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Stack the spectrogram with its first- and second-order deltas.

        Args:
            data: ``(spectrogram, sr)`` tuple.

        Returns:
            A float32 array with the three channels on the last axis, divided
            by 100. Only the image is returned; ``sr`` is dropped.
        """
        spectrogram, sr = data
        channels = [
            spectrogram,
            librosa.feature.delta(spectrogram),
            librosa.feature.delta(spectrogram, order=2),
        ]
        stacked = np.stack(channels, axis=-1)
        return stacked.astype(np.float32) / 100.0
    
    
# Convert the masked spectrogram into a 3-channel image (always applied, p=1.0).
transform = SpectToImage(p=1.0)
# NOTE: `data` is rebound here to (specAug, sr).
data = specAug, sr

# SpectToImage returns the image only, not an (image, sr) tuple.
image = transform(data=data)['data']

plt.figure(figsize=(20,10))
plt.imshow(image)
plt.show()

Show All

In [None]:
# audio_augmentation = albumentations.Compose([
#      RandomAudio(always_apply=True),
#      NoiseInjection(p=1),
#      MelSpectrogram(parameters=melspectrogram_parameters,always_apply=True),
#      SpecAugment(p=1),
#      SpectToImage(always_apply=True)
# ])

# data = np.array(sound.get_array_of_samples(), dtype=np.float32), sample_rate
# image = audio_augmentation(data=data)['data']

# plt.imshow(image)
# plt.show()

In [None]:
class CutOut(AudioTransform):
    """Silence one randomly chosen contiguous span of the waveform."""

    def __init__(self, always_apply=False, p=0.5):
        super(CutOut, self).__init__(always_apply, p)

    def apply(self, data, **params):
        """Return a copy of ``data`` with a random span set to zero.

        Args:
            data: ndarray of audio timeseries (the bare sample array, not a
                ``(sound, sr)`` tuple).

        Returns:
            A new array of the same length; the caller's array is left
            untouched. An empty input is returned as an empty copy.
        """
        # Work on a copy so the caller's signal is not overwritten.
        augmented = np.array(data, copy=True)
        n_samples = len(augmented)
        if n_samples == 0:
            # np.random.randint(0, 0) would raise ValueError.
            return augmented

        start_ = np.random.randint(0, n_samples)
        end_ = np.random.randint(start_, n_samples)

        augmented[start_:end_] = 0

        return augmented
    


# Load the example clip with librosa at 16 kHz for the CutOut demo.
y,sr = librosa.load(path,sr=16000)

print('Audio Intially')
# Last expression of the cell: renders an audio player for the loaded clip.
ipd.Audio(y, rate=sr)

In [None]:
# Apply CutOut unconditionally (p=1.0); it takes the bare sample array as `data`.
transform = CutOut(p=1.0)

print('audio after transform')
# Last expression of the cell: renders an audio player for the transformed clip.
ipd.Audio(transform(data=y)['data'],rate=sr)

# Spectrogram Extraction

https://github.com/kahst/BirdCLEF-Baseline/blob/master/utils/

In [None]:
def openAudioFile(path, sample_rate=16000, as_mono=True, mean_substract=False):
    """Load an audio file with librosa.

    Args:
        path: location of the audio file.
        sample_rate: passed to ``librosa.load`` as ``sr``.
        as_mono: passed to ``librosa.load`` as ``mono``.
        mean_substract: when truthy, subtract the signal's mean from every sample.

    Returns:
        ``(sig, rate)`` as returned by ``librosa.load`` (after the optional
        mean subtraction).
    """
    # librosa decodes the file and returns the samples with their rate.
    loaded = librosa.load(path, sr=sample_rate, mono=as_mono)
    sig, rate = loaded

    if not mean_substract:
        return sig, rate

    # Remove the constant offset of the signal.
    sig -= sig.mean()
    return sig, rate

In [None]:
def splitSignal(sig, rate, seconds, overlap, minlen):
    """Cut a signal into fixed-length chunks, optionally overlapping.

    Args:
        sig: 1-D sample sequence.
        rate: sample rate in samples per second.
        seconds: chunk length in seconds.
        overlap: overlap between consecutive chunks in seconds; must be
            smaller than ``seconds``.
        minlen: minimum length in seconds a trailing chunk must have to be kept.

    Returns:
        List of arrays, each ``int(seconds * rate)`` samples long. A trailing
        chunk that is long enough but incomplete is zero-padded, keeping the
        dtype of the signal.

    Raises:
        ValueError: if the step between chunks is not positive.
    """
    sig = np.asarray(sig)
    chunk_len = int(seconds * rate)
    step = int((seconds - overlap) * rate)
    min_samples = int(minlen * rate)

    if step <= 0:
        # A zero step cannot advance and a negative one would silently yield nothing.
        raise ValueError("overlap must be smaller than seconds")

    # Split signal with overlap
    sig_splits = []
    for start in range(0, len(sig), step):
        split = sig[start:start + chunk_len]

        # End of signal?
        if len(split) < min_samples:
            break

        # Signal chunk too short? Pad with zeros of the same dtype so that the
        # last chunk is not promoted to float64.
        if len(split) < chunk_len:
            padding = np.zeros(chunk_len - len(split), dtype=split.dtype)
            split = np.hstack((split, padding))

        sig_splits.append(split)

    return sig_splits

In [None]:
def melspec(sig, rate, shape=(128, 256), fmin=20, fmax=16000, normalize=True, preemphasis=0.95):
    """Compute a dB-scaled mel spectrogram of roughly ``shape`` pixels.

    Args:
        sig: 1-D sample array.
        rate: sample rate of ``sig``.
        shape: ``(height, width)`` of the result in pixels; the height is the
            number of mel bands.
        fmin: lowest frequency of the mel filterbank.
        fmax: highest frequency of the mel filterbank.
        normalize: scale the result to the range [0, 1].
        preemphasis: pre-emphasis coefficient; a falsy value disables the filter.

    Returns:
        float32 array, low frequencies in the last rows, trimmed to at most
        ``shape``.
    """
    # Mel-Spec parameters
    n_fft = shape[0] * 8  # = window length
    n_mels = shape[0]
    # Hop length that spreads the signal over shape[1] frames. Both guards keep
    # it a positive integer for very short signals and for a width of 1.
    hop_len = max(1, len(sig) // max(1, shape[1] - 1))

    # Preemphasis as in python_speech_features by James Lyons
    if preemphasis:
        sig = np.append(sig[0], sig[1:] - preemphasis * sig[:-1])

    # Librosa mel-spectrum (power=1.0 -> magnitude spectrum)
    spec = librosa.feature.melspectrogram(y=sig, sr=rate, hop_length=hop_len, n_fft=n_fft, n_mels=n_mels, fmax=fmax, fmin=fmin, power=1.0)

    # Convert magnitude spec to dB scale (dB relative to the peak magnitude)
    spec = librosa.amplitude_to_db(spec, ref=np.max, top_db=80)

    # Flip spectrum vertically (only for better visualization, low freq. at bottom)
    spec = spec[::-1, ...]

    # Trim to desired shape if too large
    spec = spec[:shape[0], :shape[1]]

    # Normalize values between 0 and 1
    if normalize:
        spec -= spec.min()
        peak = spec.max()
        if not peak == 0:
            spec /= peak
        else:
            spec = np.clip(spec, 0, 1)

    return spec.astype('float32')

In [None]:
def stft(sig, rate, shape=(128, 256), fmin=20, fmax=16000, normalize=True):
    """Compute a dB-scaled linear-frequency spectrogram of roughly ``shape`` pixels.

    Args:
        sig: 1-D sample array.
        rate: sample rate of ``sig``.
        shape: ``(height, width)`` of the result in pixels.
        fmin: lower cutoff frequency used to trim the spectrum.
        fmax: upper cutoff frequency used to trim the spectrum.
        normalize: scale the result to the range [0, 1].

    Returns:
        float32 array restricted to the bins between the two cutoff indices,
        low frequencies in the last rows, at most ``shape[1]`` columns wide.
    """
    # STFT-Spec parameters
    n_fft = int((rate * shape[0] * 2) / abs(fmax - fmin)) + 1
    p_min = int(float(n_fft / 2) / rate * fmin) + 1
    p_max = int(float(n_fft / 2) / rate * fmax) + 1
    # Hop length that spreads the signal over shape[1] frames. Both guards keep
    # it a positive integer for very short signals and for a width of 1.
    hop_len = max(1, len(sig) // max(1, shape[1] - 1))

    # Librosa stft-spectrum
    spec = librosa.stft(y=sig, hop_length=hop_len, n_fft=n_fft, window='hamm')

    # Convert magnitude spec to dB scale (dB relative to the peak magnitude)
    spec = librosa.amplitude_to_db(np.abs(spec), ref=np.max, top_db=80)

    # Trim to desired shape using cutoff frequencies
    spec = spec[p_min:p_max, :shape[1]]

    # Flip spectrum vertically (only for better visualization, low freq. at bottom)
    spec = spec[::-1, ...]

    # Normalize values between 0 and 1
    if normalize:
        spec -= spec.min()
        peak = spec.max()
        if not peak == 0:
            spec /= peak
        else:
            spec = np.clip(spec, 0, 1)

    return spec.astype('float32')

In [None]:
def get_spec(sig, rate, shape, spec_type='linear', **kwargs):
    """Compute a spectrogram of the requested type.

    ``spec_type`` equal to ``'melspec'`` (case-insensitive) selects
    ``melspec``; any other value selects ``stft``. Extra keyword arguments are
    forwarded to the selected function.
    """
    wants_mel = spec_type.lower() == 'melspec'
    extractor = melspec if wants_mel else stft
    return extractor(sig, rate, shape, **kwargs)

In [None]:
def signal2noise(spec):
    """Estimate how much of a spectrogram stands out from its background.

    Args:
        spec: spectrogram array; a copy is processed, the input is not modified.

    Returns:
        The sum of the thresholded and filtered mask divided by its number of
        elements (higher is better).
    """
    # Get working copy
    spec = spec.copy()

    # Calculate median for columns and rows
    col_median = np.median(spec, axis=0, keepdims=True)
    row_median = np.median(spec, axis=1, keepdims=True)

    # Binary threshold: keep only cells clearly above both their row median
    # and their column median.
    spec[spec < row_median * 1.25] = 0.0
    spec[spec < col_median * 1.15] = 0.0
    spec[spec > 0] = 1.0

    # Median blur
    spec = cv2.medianBlur(spec, 3)

    # Morphology
    spec = cv2.morphologyEx(spec, cv2.MORPH_CLOSE, np.ones((3, 3), np.float32))

    # Signal to noise ratio (higher is better). ``spec.size`` is the element
    # count for 2-D and 3-D arrays alike, so no exception handling is needed
    # to pick the right product of dimensions.
    s2n = spec.sum() / spec.size

    return s2n

In [None]:
def specsFromSignal(sig, rate, shape, seconds, overlap, minlen, **kwargs):
    """Yield one spectrogram per chunk of ``sig``.

    The signal is chunked with ``splitSignal(sig, rate, seconds, overlap,
    minlen)`` and each chunk is converted with ``get_spec``; ``kwargs`` are
    forwarded to ``get_spec``.
    """
    # Consecutive (possibly overlapping) chunks of the signal.
    chunks = splitSignal(sig, rate, seconds, overlap, minlen)

    for chunk in chunks:
        yield get_spec(chunk, rate, shape, **kwargs)

In [None]:
def specsFromFile(path, rate, seconds, overlap, minlen, shape, start=-1, end=-1, **kwargs):
    """Yield the spectrograms of an audio file, chunk by chunk.

    Args:
        path: audio file to open with ``openAudioFile``.
        rate: sample rate requested when opening the file.
        seconds, overlap, minlen, shape: forwarded to ``specsFromSignal``.
        start, end: when both are greater than -1, only the samples between
            these two positions (in seconds) are used and ``minlen`` is reset
            to 0.
        **kwargs: forwarded to ``specsFromSignal``.
    """
    # Open file; ``rate`` is replaced by the rate reported by the loader.
    sig, rate = openAudioFile(path, rate)

    # Trim signal?
    trim_requested = start > -1 and end > -1
    if trim_requested:
        first, last = int(start * rate), int(end * rate)
        sig = sig[first:last]
        minlen = 0

    # Yield all specs for file
    yield from specsFromSignal(sig, rate, shape, seconds, overlap, minlen, **kwargs)
    
if __name__ == '__main__':

    # Demo: extract one-second mel spectrograms from an example file, print
    # the noise measure of each and display it.
    for spec in specsFromFile('../input/birdsong-resampled-train-audio-03/norwat/XC120655.wav',
                              rate=44000,
                              seconds=1,
                              overlap=0,
                              minlen=1,
                              shape=(128, 256),
                              fmin=20,
                              fmax=16000,
                              spec_type='melspec'):

        # Calculate and show noise measure
        noise = signal2noise(spec)
        print (noise)

        # Show spec inline. cv2.imshow / cv2.waitKey open a GUI window and
        # wait for a key press, which is not available in a notebook kernel,
        # so the spectrogram is rendered with matplotlib instead.
        plt.figure(figsize=(10, 5))
        plt.imshow(spec, cmap='gray', vmin=0, vmax=1)
        plt.title('SPEC')
        plt.show()