## Creation of Multi-Speaker Datasets

In [6]:
import os
import glob
import torch
import random
import shutil
import sys
from speechbrain.dataio.dataio import read_audio, write_audio
import speechbrain as sb
# Make the local speechbrain project directory the working directory for the
# rest of the notebook.
os.chdir("/home/matteo/projects/speechbrain/")

# CLI:
# NOTE(review): inside a Jupyter kernel, sys.argv[1:] holds the kernel's own
# launch arguments, so this call does not yield a data folder. The recorded
# output of this cell is a 3-tuple: the kernel connection .json path, a dict of
# run options (debug, device, ...), and the string '-f'.
# `data_folder` is not used by any other cell visible in this notebook.
data_folder = sb.parse_arguments(sys.argv[1:])
print(data_folder)

('/home/matteo/.local/share/jupyter/runtime/kernel-099736a4-f4c5-45da-bd54-4641a530ccdc.json', {'debug': False, 'debug_batches': 2, 'debug_epochs': 2, 'device': 'cuda:0', 'data_parallel_count': -1, 'data_parallel_backend': False, 'distributed_launch': False, 'distributed_backend': 'nccl', 'auto_mix_prec': False, 'noprogressbar': False}, '-f')


In [7]:
# Audio format shared by every clip in this notebook.
SAMPLING_RATE = 16_000  # Hz
MAX_DURATION = 5  # seconds
MAX_SAMPLES = SAMPLING_RATE * MAX_DURATION  # number of samples in a MAX_DURATION clip

# Dataset root and the names of the split sub-folders inside it.
DATADIR = "/home/matteo/projects/data/5s-LibriSpeech/"
AUDIO_FOLDERS = ["dev-clean", "test-clean", "train-clean-100"]

In [8]:
# Sanity check on one padded clip: load it, add a leading batch axis
# (result is [batch, time]) and report its length in seconds.
xs_speech = read_audio('/home/matteo/projects/data/5s-LibriSpeech/dev-clean/8842-304647-0012-5s-w-padding.flac').unsqueeze(0)
print(xs_speech.size())
duration = xs_speech.shape[1] / SAMPLING_RATE
print(f'{duration} second sample')

torch.Size([1, 80000])
5.0 second sample


## Crop all samples to max 5s

In [9]:
# Get list of flac audio files in LibriSpeech: one list of paths per split,
# in the same order as AUDIO_FOLDERS.
# - The pattern is built from DATADIR so the dataset location is defined in
#   exactly one place.
# - glob returns matches in arbitrary order, so each list is sorted to make
#   the file order (and anything sampled from it) reproducible between runs.
# - The pattern is not recursive: flac files in sub-folders are not picked up.
audio_files = [
    sorted(glob.glob(os.path.join(DATADIR, folder, "*.flac")))
    for folder in AUDIO_FOLDERS
]

In [32]:
# Get list of unique speaker ids: one sorted list per split, in the same order
# as AUDIO_FOLDERS. The speaker id is the part of the file name before the
# first '-' (e.g. "8842" for "8842-304647-0012-5s-w-padding.flac").
unique_speakers = []
for folder, folder_files in zip(AUDIO_FOLDERS, audio_files):
    speaker_ids = {os.path.basename(path).split('-')[0] for path in folder_files}
    # Sorted because the iteration order of a set of strings can differ from
    # one kernel session to the next, which would make sampling irreproducible.
    unique_speakers.append(sorted(speaker_ids))
    print(f"{folder} unique speakers = {len(speaker_ids)}")

dev-clean unique speakers = 40
test-clean unique speakers = 40
train-clean-100 unique speakers = 251


In [12]:
# # Crop all samples to at most 5 s, i.e. min(original_duration, 5)
# for i in range(0,3):
#     print(f"Shortening and adding padding to all {AUDIO_FOLDERS[i].split('/')[-1]} samples to create 5s total duration")
#     for recording in audio_files[i]:
#         xs_speech = read_audio(recording)
#         temp = xs_speech.unsqueeze(0)[0][:MAX_SAMPLES] # [batch, time, channels]
#         write_audio(recording[:-5] + "-5s.flac", temp, 16000)

## Add padding to make all clips exactly 5s

In [77]:
# audio_files = [] 
# for folder in AUDIO_FOLDERS:
#     audio_files.append(glob.glob(f"/home/matteo/projects/data/5s-LibriSpeech/{folder}/*.flac"))

# for i in range(0,3):
#     print(f"Adding padding to all {AUDIO_FOLDERS[i].split('/')[-1]} samples to create 5s total duration")
#     for recording in audio_files[i]:
        
#         # Read file
#         xs_speech = read_audio(recording)
#         xs_speech = xs_speech.unsqueeze(0)
        
#         # Add padding
#         padding_tensor = torch.zeros(size=(1, MAX_SAMPLES))
#         padding_tensor[0,:xs_speech.size()[1]] = xs_speech
        
#         # Write file
#         write_audio(recording[:-5] + "-w-padding.flac", padding_tensor.reshape(-1), 16000)

Adding padding to all dev-clean samples to create 5s total duration
Adding padding to all test-clean samples to create 5s total duration
Adding padding to all train-clean-100 samples to create 5s total duration


## Combine 5s audio clips

In [100]:
MAX_SPEAKERS = 5   # largest number of speakers summed into one mixture
N_PASSES = 3       # passes over each split; one mixture per file per pass (3x augmentation)
NORM_EPS = 1e-8    # lower bound on norms so an all-zero clip cannot produce NaNs


def _speaker_id(path):
    """Return the speaker id of a clip: the file-name text before the first '-'."""
    return os.path.basename(path).split('-')[0]


def _unit_norm(signal):
    """Return `signal` scaled to unit L2 norm (an all-zero signal stays all-zero)."""
    return signal / torch.norm(signal).clamp_min(NORM_EPS)


# Make 1 to MAX_SPEAKERS overlap folders to store intersecting audio.
# An existing folder is deleted first, so re-running this cell never mixes
# mixtures from a previous run with new ones.
for folder in AUDIO_FOLDERS:
    for i in range(1, MAX_SPEAKERS + 1):
        new_folder_path = os.path.join(DATADIR, folder, f"{i}-speaker")
        if os.path.exists(new_folder_path):
            shutil.rmtree(new_folder_path)
        os.makedirs(new_folder_path)

# Get random number of unique speaker audio files and combine them
# (N_PASSES times per point, therefore augmenting data N_PASSES x).
for folder_id, folder in enumerate(AUDIO_FOLDERS):
    # Index the split's files by speaker once, instead of re-scanning the whole
    # file list for every speaker of every mixture. Files keep their original
    # relative order inside each per-speaker list.
    files_by_speaker = {}
    for audio in audio_files[folder_id]:
        files_by_speaker.setdefault(_speaker_id(audio), []).append(audio)

    # A mixture cannot contain more distinct speakers than the split has.
    max_mix = min(MAX_SPEAKERS, len(unique_speakers[folder_id]))

    nb_speakers_log = {str(n): 0 for n in range(1, MAX_SPEAKERS + 1)}
    mixture_id = 0  # running counter: guarantees unique file names within a split
    for iteration in range(N_PASSES):
        print(f"Creating {folder} random mixtures ({iteration+1}/{N_PASSES})")
        # One mixture is produced per source file; the file itself is not used.
        for _ in audio_files[folder_id]:
            nb_speakers = random.randint(1, max_mix)
            nb_speakers_log[str(nb_speakers)] += 1  # Keeping a count of the number of 1-5 speaker signals created.
            speaker_indices = random.sample(unique_speakers[folder_id], nb_speakers)

            # Get n unique files: one random clip from each selected speaker.
            mix_files = [random.choice(files_by_speaker[speaker_idx]) for speaker_idx in speaker_indices]

            # Read and combine n unique files, each scaled to unit L2 norm so
            # every speaker contributes the same energy.
            # Assumes every clip has exactly MAX_SAMPLES samples (padded
            # upstream) - TODO confirm; a different length fails on the add.
            out = torch.zeros(size=(1, MAX_SAMPLES))
            for flac in mix_files:
                signal = read_audio(flac)
                out += _unit_norm(signal)

            # Normalize mixture so its energy does not depend on how many
            # speakers were summed; this normalized tensor is what gets saved.
            # NOTE(review): unit L2 norm over MAX_SAMPLES samples is a very low
            # amplitude (RMS = 1/sqrt(MAX_SAMPLES)); confirm this is the
            # intended output level.
            mix_final = _unit_norm(out)

            # Output in appropriate folder. The counter replaces a random id,
            # which could repeat and silently overwrite an earlier mixture.
            mixture_name = "-".join(speaker_indices) + "-id-" + str(mixture_id) + ".flac"
            mixture_id += 1
            write_audio(
                os.path.join(DATADIR, folder, f"{nb_speakers}-speaker", mixture_name),
                mix_final.reshape(-1),
                SAMPLING_RATE,
            )

    print("Speaker mixture counts: ", nb_speakers_log)

Creating dev-clean random mixtures (1/3)
Creating dev-clean random mixtures (2/3)
Creating dev-clean random mixtures (3/3)
{'1': 1535, '2': 1664, '3': 1659, '4': 1631, '5': 1620}
Creating test-clean random mixtures (1/3)
Creating test-clean random mixtures (2/3)
Creating test-clean random mixtures (3/3)
{'1': 1546, '2': 1613, '3': 1537, '4': 1596, '5': 1568}
Creating train-clean-100 random mixtures (1/3)
Creating train-clean-100 random mixtures (2/3)
Creating train-clean-100 random mixtures (3/3)
{'1': 17144, '2': 17143, '3': 17119, '4': 17104, '5': 17107}


## Issue with normalization of mixture

In [99]:
# Listening experiment: unit-normalize two clips, save them, then save the
# unit-normalized sum of the two.
signal1 = read_audio("/home/matteo/projects/data/5s-LibriSpeech/dev-clean/8842-304647-0013-5s-w-padding.flac")
signal2 = read_audio("/home/matteo/projects/data/5s-LibriSpeech/dev-clean/8297-275154-0009-5s-w-padding.flac")

# Scale each clip in place to unit L2 norm.
signal1.div_(torch.norm(signal1))
signal2.div_(torch.norm(signal2))

# Save the individually normalized clips.
write_audio('/home/matteo/Desktop/normalized-signal-1.flac', signal1.reshape(-1), 16000)
write_audio('/home/matteo/Desktop/normalized-signal-2.flac', signal2.reshape(-1), 16000)

# Sum the two normalized clips, rescale the sum to unit L2 norm and save it.
mix = torch.add(signal1, signal2)
mix_final = torch.div(mix, torch.norm(mix))
write_audio('/home/matteo/Desktop/normalized-mix.flac', mix_final.reshape(-1), 16000)

In [96]:
# Scale factor that unit-L2 normalization applies to `signal`.
# NOTE(review): `signal` is not defined in this cell; it must already exist in
# the kernel (the mixture-creation loop assigns a variable of that name).
torch.norm(signal).reciprocal()

tensor(0.0623)

## Archive

In [89]:
# Scratch check: draw a gain in [0.5, 1] and apply it to the absolute value of
# 90 negated uniform random samples (shape [1, 90]).
random_amplitude = random.uniform(0.5, 1)
torch.rand(size=(1, 90)).neg().abs().mul(random_amplitude)

tensor([[0.0280, 0.1364, 0.3073, 0.4672, 0.1843, 0.1835, 0.4672, 0.0245, 0.0334,
         0.1008, 0.3579, 0.4835, 0.3541, 0.2426, 0.4206, 0.3084, 0.5155, 0.0537,
         0.1480, 0.3339, 0.2869, 0.3941, 0.2363, 0.0438, 0.3016, 0.2383, 0.3233,
         0.2327, 0.4076, 0.0942, 0.4946, 0.4827, 0.4047, 0.2229, 0.0310, 0.2693,
         0.2025, 0.5305, 0.1510, 0.0423, 0.3198, 0.1297, 0.5534, 0.3578, 0.3174,
         0.4060, 0.2083, 0.4383, 0.1741, 0.4602, 0.1408, 0.5316, 0.3169, 0.1578,
         0.2806, 0.5180, 0.1265, 0.2656, 0.2718, 0.4946, 0.5432, 0.2318, 0.5598,
         0.1045, 0.3467, 0.1852, 0.5656, 0.1614, 0.0882, 0.3081, 0.1189, 0.5550,
         0.2479, 0.3955, 0.4809, 0.4425, 0.4321, 0.2376, 0.3858, 0.0313, 0.0907,
         0.1604, 0.1318, 0.4905, 0.0131, 0.2096, 0.5452, 0.2597, 0.4411, 0.2101]])

In [None]:
# Create overlapping audio of 2 speakers

# file1 = read_audio('/home/matteo/projects/data/5s-LibriSpeech/dev-clean/8842-304647-0012-5s-w-padding.flac')
# file2 = read_audio('/home/matteo/projects/data/5s-LibriSpeech/dev-clean/5895-34622-0016-5s-w-padding.flac')
# write_audio("/home/matteo/Desktop/mix.flac", (file1+file2).reshape(-1), 16000)