In [6]:
from tqdm import tqdm
import os
import soundfile
import time as Time
import librosa
import numpy as np
import pandas as pd

How long to generate all spectrograms from the dataset ?

In [7]:
# Create a directory to store preprocessed files
os.makedirs("preprocessed", exist_ok=True)
os.makedirs("preprocessed/clean_audio", exist_ok=True)
os.makedirs("preprocessed/noisy_audio", exist_ok=True)
os.makedirs("preprocessed/temp", exist_ok=True)

DEV_DATASET = "datasets/LibriSpeech/dev-clean"
TRAIN_DATASET = "datasets/LibriSpeech/train-clean-100"
dataset_path = DEV_DATASET
audio_files_list = []
for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if file.endswith(".flac"):
            fullpath = os.path.join(root, file)
            audio_files_list.append(fullpath)
print(f"There is a total of {len(audio_files_list)} audio files in the dataset.")

n_benchmark_samples = 256
benchmark_sample = audio_files_list[:n_benchmark_samples]


fwritten = 0
def store_stft(dirpath: str, audio_file: str, n_fft=2048, hop_length=None):
    t_start = Time.time()
    if hop_length is None:
        hop_length = n_fft // 4
    signal, samplerate = soundfile.read(audio_file)
    n_segments = len(signal) // samplerate
    for index in range(n_segments):
        segment = signal[index : index + 1]
        stft_signal = librosa.stft(segment, n_fft=n_fft, hop_length=hop_length)
        stft_signal.shape
        magnitude = np.abs(stft_signal)
        filepath = hex(np.abs(hash(magnitude.tobytes())))
        fullpath = os.path.join(dirpath, filepath)
        if os.path.exists(fullpath):
            raise Exception(f"Hash collision for file : {filepath}.")
        np.save(fullpath, magnitude)
        global fwritten
        fwritten += 1
    t_stop = Time.time()
    return t_stop - t_start


computing_times = np.zeros(len(benchmark_sample))
for index, audio_file in enumerate(benchmark_sample):
    ttc = store_stft("preprocessed/temp", audio_file)
    computing_times[index] = ttc


avg_ttc = np.mean(computing_times)
print(f"Avergage time to compute and save STFT : {avg_ttc:.3f}s.")
print(f"Predicted time for the whole dataset : {len(audio_files_list)*avg_ttc:.3f}s")

start = Time.time()

if False: # Change to `True` to store all the stfts
    for index in tqdm(range(len(audio_files_list))):
        audio_file = audio_files_list[index]
        store_stft("preprocessed/clean_audio", audio_file)
    stop = Time.time()
    print(f"Total time : {stop-start:.3f}s.")
    print(f"Wrote {fwritten} files.")

There is a total of 2703 audio files in the dataset.




Avergage time to compute and save STFT : 0.003s.
Predicted time for the whole dataset : 9.174s


100%|██████████| 2703/2703 [00:11<00:00, 227.66it/s]

Total time : 11.874s.
Wrote 19404 files.





#### Now we need to create the noisy audios
According to the README, these are the mapping between the target number and the class of each sound:
- target // 10 == 0 => Animals
- target // 10 == 1 => Natural soundscapes & water sounds
- target // 10 == 2 => Human, non-speech sounds
- target // 10 == 3 => Interior/domestic sounds
- target // 10 == 4 => Exterior/urban noises

What we need to do is first to create five folder for each class.
Then for each file in the whole dataset, we will cut it in 1s segments and put these segment in the folder corresponding to the class of the segment.
This will give us some basis to create a training dataset which will consist of the speech recordings to which we will and some noise from 1 or more classes from the dataset.

This also raise the question of the magnitude of the noise that will be added.
It should be high enough to train the model on useful situations but low enough to avoid polluting the audio.
Although maybe this won't make a difference for example if the model rely significately more of the frequency than the magnitude to detect the noise. One thing to consider is that the magnintude of the added noise should be random to prevent the model from potentially detecting noise only by looking at the magnitude. 



In [8]:
NOISE_DATASET = "datasets/ESC-50-master/"
audio_dir = os.path.join(NOISE_DATASET, "audio")
labels_path = os.path.join(NOISE_DATASET,"meta/esc50.csv")
labels = pd.read_csv(labels_path)
labels.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A
