In [34]:
import torch
import wave
import librosa
import torchaudio.transforms as T
import numpy as np
import io
import random
from pydub import AudioSegment
from IPython.display import Audio
from datasets import load_dataset, Dataset

In [2]:
def add_gaussian_noise(audio, noise_level=0.01):
    """Return a copy of an audio example with zero-mean Gaussian noise added.

    Args:
        audio: Mapping with an ``"audio"`` entry holding ``"array"`` (the
            samples) and ``"sampling_rate"``.
        noise_level: Standard deviation of the noise drawn for every sample.

    Returns:
        ``{"audio": {"array": float32 ndarray, "sampling_rate": ...}}`` with
        the sampling rate passed through unchanged.
    """
    clip = audio["audio"]
    clean = np.array(clip["array"], dtype=np.float32)
    # Noise is drawn from NumPy's global RNG; seed it upstream for reproducibility.
    jitter = np.random.normal(0, noise_level, size=clean.shape).astype(np.float32)
    noisy_clip = {"array": clean + jitter, "sampling_rate": clip["sampling_rate"]}
    return {"audio": noisy_clip}


In [3]:
def time_stretch(audio, stretch_factor_range=(0.85, 1.15)):
    """Time-stretch an audio example by a randomly drawn rate.

    Args:
        audio: Mapping with an ``"audio"`` entry holding ``"array"`` (the
            samples) and ``"sampling_rate"``.
        stretch_factor_range: Arguments forwarded to ``np.random.uniform`` to
            draw the rate (by default a ``(low, high)`` pair).

    Returns:
        ``{"audio": {"array": stretched samples, "sampling_rate": ...}}``.
        The sampling rate is passed through unchanged; the number of samples
        generally differs from the input.
    """
    clip = audio["audio"]
    samples = np.array(clip["array"])
    # Per the librosa docs, rate > 1 speeds the clip up and rate < 1 slows it down.
    rate = np.random.uniform(*stretch_factor_range)
    stretched = librosa.effects.time_stretch(samples.astype(np.float32), rate=rate)
    return {"audio": {"array": stretched, "sampling_rate": clip["sampling_rate"]}}

In [None]:
# TODO: the augmentation functions in this cell have not been tested yet.
def pitch_scale(audio, pitch_factor_range=(-2, 2)):
    """Pitch-shift an audio example by a randomly drawn number of steps.

    Args:
        audio: Mapping with an ``"audio"`` entry holding ``"array"`` (the
            samples) and ``"sampling_rate"``.
        pitch_factor_range: Arguments forwarded to ``np.random.uniform`` to
            draw ``n_steps`` (by default a ``(low, high)`` pair). librosa
            interprets ``n_steps`` as (fractional) semitones with its default
            settings.

    Returns:
        ``{"audio": {"array": shifted samples, "sampling_rate": ...}}`` with
        the sampling rate passed through unchanged.
    """
    clip = audio["audio"]
    samples = np.array(clip["array"])
    sr = clip["sampling_rate"]
    n_steps = np.random.uniform(*pitch_factor_range)
    shifted = librosa.effects.pitch_shift(
        samples.astype(np.float32),
        sr=sr,
        n_steps=n_steps,
    )
    return {"audio": {"array": shifted, "sampling_rate": sr}}
    
def add_reverb(audio, reverb_factor=0.3):
    """Add a simple synthetic reverb (a train of decaying echoes) to an audio example.

    The previous implementation called ``librosa.effects.preemphasis``, which
    is a first-order high-pass filter (``y[n] = x[n] - coef * x[n-1]``) and
    does not produce reverberation. This version mixes delayed, progressively
    attenuated copies of the signal back into it.

    Args:
        audio: Mapping with an ``"audio"`` entry holding ``"array"`` (the
            samples, time on the last axis) and ``"sampling_rate"``.
        reverb_factor: Gain applied per echo; the k-th echo is scaled by
            ``reverb_factor ** k``. ``0`` leaves the signal unchanged; values
            in ``[0, 1)`` give a decaying tail.

    Returns:
        ``{"audio": {"array": float32 ndarray, "sampling_rate": ...}}`` with
        the same number of samples and sampling rate as the input.
    """
    echo_spacing_seconds = 0.03  # gap between successive echoes
    n_echoes = 8                 # echoes beyond this are negligible for factors < 1

    waveform = np.array(audio["audio"]["array"], dtype=np.float32)
    sampling_rate = audio["audio"]["sampling_rate"]
    length = waveform.shape[-1]
    echo_delay = max(1, int(round(echo_spacing_seconds * sampling_rate)))

    reverb_waveform = waveform.copy()
    for k in range(1, n_echoes + 1):
        shift = k * echo_delay
        if shift >= length:
            # Later echoes would start after the end of the clip.
            break
        reverb_waveform[..., shift:] += np.float32(reverb_factor ** k) * waveform[..., :-shift]

    # Summed echoes can exceed full scale; rescale only when that happens.
    peak = float(np.max(np.abs(reverb_waveform))) if reverb_waveform.size else 0.0
    if peak > 1.0:
        reverb_waveform = reverb_waveform / np.float32(peak)

    return {
        "audio": {
            "array": reverb_waveform,
            "sampling_rate": sampling_rate
        }
    }
    
def change_volume(audio, volume_factor_range=(0.5, 1.5)):
    """Scale an audio example by a randomly drawn gain and clip to [-1, 1].

    Args:
        audio: Mapping with an ``"audio"`` entry holding ``"array"`` (the
            samples) and ``"sampling_rate"``.
        volume_factor_range: Arguments forwarded to ``np.random.uniform`` to
            draw the gain (by default a ``(low, high)`` pair).

    Returns:
        ``{"audio": {"array": clipped samples, "sampling_rate": ...}}`` with
        the sampling rate passed through unchanged.
    """
    clip = audio["audio"]
    samples = np.array(clip["array"], dtype=np.float32)
    gain = np.random.uniform(*volume_factor_range)
    # Hard-limit after scaling so amplified samples stay within [-1, 1].
    limited = np.clip(samples * gain, -1.0, 1.0)
    return {"audio": {"array": limited, "sampling_rate": clip["sampling_rate"]}}

from scipy.signal import butter, lfilter

def low_pass_filter(audio, cutoff=1000):
    """Apply a 6th-order Butterworth low-pass filter to an audio example.

    Args:
        audio: Mapping with an ``"audio"`` entry holding ``"array"`` (the
            samples) and ``"sampling_rate"`` (Hz).
        cutoff: Cutoff frequency in Hz. Must lie strictly between 0 and the
            Nyquist frequency (half the sampling rate).

    Returns:
        ``{"audio": {"array": float32 ndarray, "sampling_rate": ...}}`` with
        the sampling rate passed through unchanged.

    Raises:
        ValueError: If ``cutoff`` is not strictly between 0 and Nyquist.
    """
    waveform = np.array(audio["audio"]["array"], dtype=np.float32)
    sampling_rate = audio["audio"]["sampling_rate"]
    nyquist = 0.5 * sampling_rate
    # Fail early with a readable message instead of SciPy's generic Wn error.
    if not 0 < cutoff < nyquist:
        raise ValueError(
            f"cutoff must be between 0 and the Nyquist frequency ({nyquist} Hz), got {cutoff}"
        )
    normal_cutoff = cutoff / nyquist
    b, a = butter(6, normal_cutoff, btype='low', analog=False)
    # lfilter promotes to float64; cast back so the output dtype matches the
    # other augmentation functions in this notebook.
    filtered_waveform = lfilter(b, a, waveform).astype(np.float32)
    return {
        "audio": {
            "array": filtered_waveform,
            "sampling_rate": sampling_rate
        }
    }

In [None]:
# Open LibriSpeech ("clean" config, "train.100" split) in streaming mode, pull
# the first n_samples examples from the stream, and materialise them as a
# small in-memory Dataset.
libris_ds = load_dataset("librispeech_asr", "clean", split="train.100", streaming=True)
n_samples = 10  # also reused by the UrbanSound8K loading cell further down
iterator = iter(libris_ds)
samples = [next(iterator) for _ in range(n_samples)]
# NOTE: this rebinds libris_ds from the streaming dataset to the materialised subset.
libris_ds = Dataset.from_list(samples)

{'file': '374-180298-0000.flac', 'audio': {'array': [0.000701904296875, 0.000732421875, 0.000732421875, 0.000762939453125, 0.000762939453125, 0.000762939453125, 0.000732421875, 0.000762939453125, 0.000701904296875, 0.000732421875, 0.000732421875, 0.000762939453125, 0.00079345703125, 0.000823974609375, 0.000823974609375, 0.000823974609375, 0.00079345703125, 0.000762939453125, 0.000762939453125, 0.000762939453125, 0.000701904296875, 0.000732421875, 0.000701904296875, 0.000732421875, 0.000762939453125, 0.000762939453125, 0.000762939453125, 0.000762939453125, 0.000762939453125, 0.000732421875, 0.000701904296875, 0.00067138671875, 0.000732421875, 0.00067138671875, 0.00067138671875, 0.00067138671875, 0.000732421875, 0.00067138671875, 0.000732421875, 0.000701904296875, 0.000701904296875, 0.00067138671875, 0.000640869140625, 0.000640869140625, 0.0006103515625, 0.000640869140625, 0.000640869140625, 0.00067138671875, 0.000701904296875, 0.00067138671875, 0.00067138671875, 0.000701904296875, 0.000

In [5]:
# Play the first LibriSpeech example as loaded (no augmentation applied).
Audio(data=libris_ds[0]["audio"]["array"], rate=libris_ds[0]["audio"]["sampling_rate"])

In [6]:
# Apply add_gaussian_noise (default noise_level) to every example, then play
# the first augmented example.
noisy_dataset = libris_ds.map(add_gaussian_noise)
Audio(data=noisy_dataset[0]["audio"]["array"], rate=noisy_dataset[0]["audio"]["sampling_rate"])

Map: 100%|██████████| 10/10 [00:01<00:00,  8.02 examples/s]


In [7]:
# Apply time_stretch one example at a time (batched=False), so each example
# gets its own random stretch rate, then play the first stretched example.
stretched_dataset = libris_ds.map(time_stretch, batched=False)
Audio(data=stretched_dataset[0]["audio"]["array"], rate=stretched_dataset[0]["audio"]["sampling_rate"])

Map: 100%|██████████| 10/10 [00:02<00:00,  4.89 examples/s]


In [8]:
# Open the UrbanSound8K "train" split in streaming mode and materialise the
# first n_samples examples as a small in-memory Dataset.
# NOTE: n_samples is defined in the LibriSpeech loading cell; run that cell first.
# NOTE: iterator and samples overwrite the variables of the same name from that cell.
urbansound_ds = load_dataset("danavery/urbansound8K", split="train", streaming=True)
iterator = iter(urbansound_ds)
samples = [next(iterator) for _ in range(n_samples)]
urbansound_ds = Dataset.from_list(samples)

In [39]:
# Play the second UrbanSound8K example (index 1), the clip used for the overlay below.
Audio(data=urbansound_ds[1]["audio"]["array"], rate=urbansound_ds[1]["audio"]["sampling_rate"])

In [None]:
def overlay_audio(speech_sample, sound_sample, mixing_ratio=1):
    """Mix a background sound into the start of a speech clip.

    The sound is resampled to the speech sampling rate when the two differ,
    scaled by ``mixing_ratio``, truncated to the speech length if longer, and
    added to the beginning of the speech. A sound shorter than the speech only
    covers the start; it is not looped. The output always has the same length
    as the speech.

    Args:
        speech_sample: Mapping with an ``"audio"`` entry holding ``"array"``
            and ``"sampling_rate"`` for the speech clip.
        sound_sample: Mapping of the same layout for the background sound.
        mixing_ratio: Gain applied to the sound before mixing.

    Returns:
        ``(mixed_audio, sampling_rate)``: a float32 array and the speech
        sampling rate. The mix is peak-normalised only when its absolute peak
        exceeds 1.0.
    """
    # Cast to float32 so integer-typed inputs can be mixed in place and the
    # output dtype matches the other augmentation functions. np.array copies,
    # so the caller's data is never modified.
    mixed_audio = np.array(speech_sample['audio']['array'], dtype=np.float32)
    speech_sr = speech_sample['audio']['sampling_rate']

    sound_audio = np.array(sound_sample['audio']['array'], dtype=np.float32)
    sound_sr = sound_sample['audio']['sampling_rate']

    if sound_sr != speech_sr:
        sound_audio = librosa.resample(sound_audio, orig_sr=sound_sr, target_sr=speech_sr)

    # Only the overlapping prefix is mixed.
    overlap = min(len(mixed_audio), len(sound_audio))
    mixed_audio[:overlap] += sound_audio[:overlap] * mixing_ratio

    # Guard the empty case: np.max raises on a zero-length array.
    peak = float(np.max(np.abs(mixed_audio))) if mixed_audio.size else 0.0
    if peak > 1.0:
        mixed_audio = mixed_audio / np.float32(peak)

    return mixed_audio, speech_sr

# Mix the second UrbanSound8K clip into the first LibriSpeech utterance
# (default mixing_ratio) and play the result at the speech sampling rate.
data, fs = overlay_audio(libris_ds[0], urbansound_ds[1])
Audio(data=data, rate=fs)