<a href="https://colab.research.google.com/github/satvik-venkatesh/train-synth-audio-seg/blob/main/train-set-synthesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This file synthesises training sets for the paper titled "Training Set Synthesis for Audio Segmentation and Classification of Music and Speech in Radio Broadcast", submitted to the journal Electronics.

In [None]:
!pip install numba==0.48

In [None]:
!pip install soundfile
!sudo apt-get install sox
!pip install pyloudnorm
!pip install librosa==0.7.2
from subprocess import Popen, PIPE
import glob
import numpy as np
import random
import soundfile as sf
import pprint
import json
import pickle
from shutil import copyfile
import os
import librosa

In [None]:
"""
Mount Google Drive into Colab.
"""
# `google.colab` is provided by the Colab runtime, so this cell is Colab-specific.
from google.colab import drive
# Expose the Drive contents under /content/drive.
drive.mount('/content/drive')

# Download all the datasets containing individual files of Music and Speech
We do not have permission to share the datasets. However, all of them are either openly available or can be obtained from the authors of the datasets. After obtaining a dataset, it is a good idea to store the .zip file in your personal Google Drive and use a !wget command to download it into this notebook. The transfer speed of Google Drive is generally better than that of the original file server. Note, however, that extracting the zip file directly from Google Drive is slow for large files (> 5 GB).

In [None]:
"""
Download the Musan database.
"""
file_size = 0
while file_size < 10:
  !wget http://www.openslr.org/resources/17/musan.tar.gz
  file_size = os.path.getsize("musan.tar.gz") / (1024 ** 2)
  print("file_size is {}".format(file_size))

In [None]:
# Unpack the downloaded archive into /content; later cells expect the audio under /content/musan.
# -v prints every extracted path, so this cell produces a lot of output.
!tar -xvf  "/content/musan.tar.gz" -C "/content"
print("Completely extracted the files!")
# Scheirer and Slaney

In [None]:
"""
Remove all silences and resample the sound files.
"""
import glob

# SoX effect chains, kept as argument lists so that no shell quoting is involved.
RESAMPLE_EFFECTS = ["rate", "22050"]
SILENCE_EFFECTS = ["silence", "-l", "1", "0.1", "1%", "-1", "0.1", "1%"]
MIN_NOISE_DURATION = 9.1  # seconds


def run_sox_in_place(sound, effects):
  """
  Run SoX on `sound` with the given effect arguments and overwrite `sound` with the result.

  sound: path of the audio file to process.
  effects: list of SoX effect tokens, e.g. ["rate", "22050"].

  Raises RuntimeError (carrying SoX's stderr) when SoX exits with a non-zero status;
  the original file is left untouched in that case.
  """
  root, ext = os.path.splitext(sound)
  temp_file = root + "_t" + ext
  # An argument list without shell=True keeps paths containing spaces or shell
  # metacharacters from being split or interpreted by a shell.
  command = ["sox", sound, temp_file] + list(effects)
  p = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  output, err = p.communicate()
  if p.returncode != 0:
    if os.path.exists(temp_file):
      os.remove(temp_file)
    raise RuntimeError("sox failed for {}: {}".format(sound, err.decode(errors="replace")))
  # Replace the original with the processed file in a single step.
  os.replace(temp_file, sound)


# Music and speech: resample to 22.05 kHz and strip silences.
music_list = glob.glob('/content/musan/music/**/*.wav', recursive = True)
print("The contents of the music_list are: {}".format(music_list))

for sound in music_list:
  run_sox_in_place(sound, RESAMPLE_EFFECTS + SILENCE_EFFECTS)


speech_list = glob.glob('/content/musan/speech/**/*.wav', recursive = True)
print("The contents of the speech_list are: {}".format(speech_list))

for sound in speech_list:
  run_sox_in_place(sound, RESAMPLE_EFFECTS + SILENCE_EFFECTS)


# Noise: resample only; silences are kept.
noise_list = glob.glob('/content/musan/noise/**/*.wav', recursive = True)
print("The contents of the noise_list are: {}".format(noise_list))

for sound in noise_list:
  run_sox_in_place(sound, RESAMPLE_EFFECTS)

# Lengthen noise files shorter than 9.1 s. SoX's `repeat 4` appends four extra
# copies, so the resulting file is five times the original length.
for sound in noise_list:
  # Header-only read: the duration is needed, not the samples.
  info = sf.info(sound)
  t = float(info.frames) / info.samplerate
  if t < MIN_NOISE_DURATION:
    run_sox_in_place(sound, ["repeat", "4"])

MUSAN is good for initial testing. To expand the data repository, you could download the [GTZAN music-speech](http://marsyas.info/downloads/datasets.html), [GTZAN Genre collection](http://marsyas.info/downloads/datasets.html), [Scheirer & Slaney](https://labrosa.ee.columbia.edu/sounds/musp/scheislan.html), [Instrument Recognition in Musical Audio Signals](https://www.upf.edu/web/mtg/irmas#:~:text=IRMAS%20is%20intended%20to%20be,violin%2C%20and%20human%20singing%20voice.), [Singing Voice dataset](http://isophonics.net/SingingVoiceDataset), and [LibriSpeech (train-clean-100, dev-other)](http://www.openslr.org/12/) datasets. Please format the examples such that music and speech are stored in different folders. If the duration of a file is less than 9.1 s, loop the audio to get the required duration. This can easily be done using the following command in SoX: "sox " + sound + " " + temp_file + " repeat 3"

In [None]:
# Report how many music, speech and noise files were found (in that order).
for file_list in (music_list, speech_list, noise_list):
  print(len(file_list))

In [None]:
# Minimum usable length in frames. The 22050 factor assumes the files were
# resampled to 22.05 kHz by the preprocessing cell.
min_dur = int(9.1 * 22050)


def _is_long_enough(path):
  """Return True when `path` holds at least `min_dur` frames.

  Uses `sf.info`, which reads only the file header, instead of decoding the
  whole file into memory just to count its samples.
  """
  return sf.info(path).frames >= min_dur


music_files = glob.glob('/content/musan/music/**/*.wav', recursive = True)
music_files.sort()
speech_files = glob.glob('/content/musan/speech/**/*.wav', recursive = True)
speech_files.sort()

noise_files = glob.glob('/content/musan/noise/**/*.wav', recursive = True)
noise_files.sort()

# Keep only the files that are long enough to cut a clip from.
music_files_filt = [m for m in music_files if _is_long_enough(m)]
speech_files_filt = [s for s in speech_files if _is_long_enough(s)]
noise_files_filt = [s for s in noise_files if _is_long_enough(s)]



In [None]:
# Sort first so that the seeded shuffle below always starts from the same order.
for file_group in (music_files_filt, speech_files_filt, noise_files_filt):
  file_group.sort()

for file_group in (music_files_filt, speech_files_filt, noise_files_filt):
  print(len(file_group))

# Fixed seed: the three shuffles (music, speech, noise, in that order) are repeatable.
random.seed(4)
for file_group in (music_files_filt, speech_files_filt, noise_files_filt):
  random.shuffle(file_group)

m_music, m_speech, m_noise = len(music_files_filt), len(speech_files_filt), len(noise_files_filt)

# The first 80 % of every shuffled list is used for training.
split_music, split_speech, split_noise = int(0.8 * m_music), int(0.8 * m_speech), int(0.8 * m_noise)

# To synthesise training set
music_list = music_files_filt[:split_music]
speech_list = speech_files_filt[:split_speech]
noise_list = noise_files_filt[:split_noise]

"""
To synthesise validation set, uncomment the below lines
(The original paper used manually annotated radio recordings as the validation set).
"""
# music_list = music_files_filt[split_music:]
# speech_list = speech_files_filt[split_speech:]
# noise_list = noise_files_filt[split_noise:]


In [None]:
# Sizes of the selected music, speech and noise lists, then a peek at the speech files.
for file_list in (music_list, speech_list, noise_list):
  print(len(file_list))
print(speech_list[:20])

In [None]:
# Re-seed Python's `random` without a fixed value; presumably so that the synthesis
# below is not tied to the seed(4) used for the train/validation split.
random.seed()

# Below is the process to synthesise data.

There are two types of examples --- (1) examples without background music and (2) examples with background music

## Synthesis with background music

In [None]:
"""
This is the mixed (includes music + speech) version of create_transition
"""
def create_mixed_transition(max_f_out_dur = 1.0, max_f_in_dur = 1.0, max_c_fade_dur = 1.0, audio_clip_length = 8.0, min_segment_length = 1.0):
    """
    Draw the random parameters of one music+speech transition.

    Returns a tuple (transition, point). `transition` is a dict holding 'type'
    plus the gain and fade parameters that apply to that type. `point` is the
    transition time in seconds, or -1.0 for the plain "music+speech" type,
    which has no transition.

    The 'music_gain' values drawn from (0.3, 0.7) are placeholders; the original
    code notes that they are set again later according to the loudness
    normalisation. `max_c_fade_dur` is accepted but not used here.
    """
    curve_names = ['linear', 'exp-convex', 'exp-concave', 's-curve']
    curved = ("exp-convex", "exp-concave", "s-curve")

    def draw_fade(params, prefix, max_dur):
        # Draws curve, duration and (for non-linear curves) the exponent, in that order.
        params[prefix + '_curve'] = random.choice(curve_names)
        params[prefix + '_dur'] = np.random.uniform(0, max_dur)
        if params[prefix + '_curve'] in curved:
            # A single 'exp_value' key is shared by fade-in and fade-out; a later draw overwrites it.
            params['exp_value'] = np.random.uniform(1.5, 3.0)

    transition = {}
    kind = random.choice(["music+speech", "speech_to_music+speech", "music_to_music+speech", "music+speech_to_music", "music+speech_to_speech"])
    transition['type'] = kind

    if kind == "speech_to_music+speech":
        transition['music_gain'] = np.random.uniform(0.3, 0.7)
        draw_fade(transition, 'f_in', max_f_in_dur)

    elif kind == "music_to_music+speech":
        transition['music_gain_1'] = 1.0
        transition['music_gain_2'] = np.random.uniform(0.3, 0.7)
        draw_fade(transition, 'f_in', max_f_in_dur)
        draw_fade(transition, 'f_out', max_f_out_dur)

    elif kind == "music+speech_to_music":
        transition['music_gain_1'] = np.random.uniform(0.3, 0.7)
        transition['music_gain_2'] = 1.0
        draw_fade(transition, 'f_out', max_f_out_dur)
        draw_fade(transition, 'f_in', max_f_in_dur)

    elif kind == "music+speech_to_speech":
        transition['music_gain'] = np.random.uniform(0.3, 0.7)
        draw_fade(transition, 'f_out', max_f_out_dur)

    elif kind == "music+speech":
        transition['music_gain'] = np.random.uniform(0.3, 0.7)

    # The transition time is decided here as well.
    if kind == "music+speech":
        return (transition, -1.0)

    # Leave room for a minimum-length segment plus the longest possible fade on both sides.
    point = np.random.uniform(min_segment_length + max_f_out_dur, audio_clip_length - min_segment_length - max_f_in_dur)
    return (transition, point)

In [None]:
def create_mixed_samples_list(music_sounds, speech_sounds):
    """
    Pick one random music file and one random speech file.

    `music_sounds` and `speech_sounds` are sequences to choose from.
    Returns a dict with the keys 'music' and 'speech'.
    """
    # Music is drawn before speech; the order fixes the sequence of random draws.
    music_pick = random.choice(music_sounds)
    speech_pick = random.choice(speech_sounds)
    return {'music': music_pick, 'speech': speech_pick}

In [None]:
def get_mixed_segment_lengths(transition, audio_clip_length=8.0, sr = 22050):
  """
  Work out how many samples of music and of speech a clip needs.

  `transition` is a (parameters, transition_time_in_seconds) tuple.
  Returns a dict with the keys 'music' and 'speech' (lengths in samples),
  or an empty dict when the transition type is not recognised.
  """
  whole_clip = int(audio_clip_length * sr)
  split_at = int(transition[1] * sr)  # transition time in samples

  # The class that is present throughout gets the whole clip; the other one
  # gets the part before or after the transition.
  layouts = {
    "music+speech": (('music', whole_clip), ('speech', whole_clip)),
    "speech_to_music+speech": (('speech', whole_clip), ('music', whole_clip - split_at)),
    "music_to_music+speech": (('speech', whole_clip - split_at), ('music', whole_clip)),
    "music+speech_to_music": (('music', whole_clip), ('speech', split_at)),
    "music+speech_to_speech": (('speech', whole_clip), ('music', split_at)),
  }
  return dict(layouts.get(transition[0]['type'], ()))

In [None]:
def get_mixed_random_segments(samples, segment_lengths, f_buffer = 0.0, sr = 22050):
    """
    Choose a random excerpt of the required length from the speech and the music file.

    samples: dict with the keys 'speech' and 'music' holding file paths.
    segment_lengths: dict with the same keys holding excerpt lengths in samples.
    f_buffer: margin in seconds kept free at both ends of each file.
    sr: sample rate used to convert `f_buffer` to samples.

    Returns a dict mapping 'speech' and 'music' to (start, stop) sample indices.
    Raises ValueError when a file is too short for the requested excerpt.
    """
    f_buffer_samples = int(f_buffer * sr)

    segments = {}
    # Speech is drawn before music; the order fixes the sequence of random draws.
    for name in ('speech', 'music'):
        # Only the length is needed, so read the header instead of decoding the file.
        total_frames = sf.info(samples[name]).frames
        length = segment_lengths[name]
        latest_start = total_frames - length - f_buffer_samples
        if latest_start <= f_buffer_samples:
            raise ValueError("{} is too short ({} frames) for a segment of {} frames".format(samples[name], total_frames, length))
        start = np.random.randint(f_buffer_samples, latest_start)
        segments[name] = (start, start + length)

    return segments

In [None]:
def apply_mixed_fade_out(audio, transition, sr=22050.0, end_gain = 0.0):
  """
  Fade out the tail of `audio` in place, from gain 1.0 down to `end_gain`.

  The fade length ('f_out_dur', seconds), its shape ('f_out_curve') and, for the
  non-linear shapes, the exponent ('exp_value') are read from `transition[0]`.
  An unrecognised curve name leaves `audio` untouched. Returns None.
  """
  end = audio.shape[0]
  params = transition[0]
  n_fade = int(params['f_out_dur'] * sr)
  tail = slice(end - n_fade, end)
  curve = params['f_out_curve']

  if curve == "linear":
      # The linear ramp goes straight to end_gain; no rescaling step is needed.
      audio[tail] = audio[tail] * np.linspace(1.0, end_gain, num = n_fade)
      return

  # The remaining shapes are first built as a 1 -> 0 curve.
  if curve == "exp-concave":
      unit_curve = np.linspace(1.0, 0.0, num = n_fade) ** params['exp_value']
  elif curve == "exp-convex":
      unit_curve = 1 - np.linspace(0.0, 1.0, num = n_fade) ** params['exp_value']
  elif curve == "s-curve":
      # Convex first half (1 -> 0.5) followed by a concave second half (0.5 -> 0).
      first_half = int(n_fade / 2)
      power = params['exp_value']
      upper = 0.5 * (1 - np.linspace(0.0, 1.0, num = first_half) ** power) + 0.5
      lower = 0.5 * (1 - np.linspace(0.0, 1.0, num = n_fade - first_half)) ** power
      unit_curve = np.concatenate((upper, lower))
  else:
      return

  # Rescale 1 -> 0 into 1 -> end_gain and apply it.
  audio[tail] = audio[tail] * (unit_curve * (1 - end_gain) + end_gain)

In [None]:
def apply_mixed_normal_fade_in(audio, transition, sr=22050.0, end_gain = 1.0, start_gain = 0.0):
  """
  Fade in the head of `audio` in place, from `start_gain` up to `end_gain`.

  The fade length ('f_in_dur', seconds), its shape ('f_in_curve') and, for the
  non-linear shapes, the exponent ('exp_value') are read from `transition[0]`.
  An unrecognised curve name leaves `audio` untouched. Returns None.
  """
  params = transition[0]
  n_fade = int(params['f_in_dur'] * sr)
  head = slice(0, n_fade)
  curve = params['f_in_curve']

  if curve == "linear":
    # The linear ramp runs directly between the two gains; no rescaling step is needed.
    audio[head] = audio[head] * np.linspace(start_gain, end_gain, num = n_fade)
    return

  # The remaining shapes are first built as a 0 -> 1 curve.
  if curve == "exp-concave":
    unit_curve = np.linspace(0.0, 1.0, num = n_fade) ** params['exp_value']
  elif curve == "exp-convex":
    unit_curve = 1 - np.linspace(1.0, 0.0, num = n_fade) ** params['exp_value']
  elif curve == "s-curve":
    # Concave first half (0 -> 0.5) followed by a convex second half (0.5 -> 1).
    first_half = int(n_fade / 2)
    power = params['exp_value']
    lower = 0.5 * np.linspace(0.0, 1.0, num = first_half) ** power
    upper = 0.5 * (1 - (1 - np.linspace(0.0, 1.0, num = n_fade - first_half)) ** power) + 0.5
    unit_curve = np.concatenate((lower, upper))
  else:
    return

  # Rescale 0 -> 1 into start_gain -> end_gain and apply it.
  audio[head] = audio[head] * (unit_curve * (end_gain - start_gain) + start_gain)

In [None]:
def generate_mixed_multiclass_labels(transition, audio_clip_length = 8.0, sr = 22050.0, res = 220):
  """
  Generate the event labels of a music+speech example.

  transition: (parameters, transition_time_in_seconds) tuple; parameters['type'] is one of
    "music+speech", "speech_to_music+speech", "music_to_music+speech",
    "music+speech_to_music", "music+speech_to_speech".
  audio_clip_length: clip duration in seconds; used as the stop time of every
    class that lasts until the end of the clip.
  sr, res: accepted for interface compatibility (`res` is in samples); not used here.

  Returns a list of [start_time, stop_time, class_name] entries with times in
  seconds, or an empty list when the transition type is not recognised.
  """
  t_point = transition[1]
  clip_end = audio_clip_length  # previously hard-coded as 8.0
  kind = transition[0]['type']

  labels = []

  if kind == "music+speech":
    labels.append([0.0, clip_end, "speech"])
    labels.append([0.0, clip_end, "music"])

  elif kind == "speech_to_music+speech":
    # Speech throughout, music enters at the transition.
    labels.append([0.0, clip_end, "speech"])
    labels.append([t_point, clip_end, "music"])

  elif kind == "music_to_music+speech":
    # Music throughout, speech enters at the transition.
    labels.append([0.0, clip_end, "music"])
    labels.append([t_point, clip_end, "speech"])

  elif kind == "music+speech_to_music":
    # Music throughout, speech stops at the transition.
    labels.append([0.0, t_point, "speech"])
    labels.append([0.0, clip_end, "music"])

  elif kind == "music+speech_to_speech":
    # Speech throughout, music stops at the transition.
    labels.append([0.0, clip_end, "speech"])
    labels.append([0.0, t_point, "music"])

  return labels

In [None]:
"""
This would create an audio clip template based on the output of create_mixed_transition.
"""
"""
"music+speech", "speech_to_music+speech", "music_to_music+speech", "music+speech_to_music", "music+speech_to_speech"
"""

def _load_normalized_segment(path, start, stop):
  """Read frames [start, stop) of `path` with soundfile and pass them through librosa.util.normalize."""
  audio, _ = sf.read(path, start = start, stop = stop)
  return librosa.util.normalize(audio)


def create_mixed_audio_clip(audio_clip_length = 8.0, sr = 22050.0):
  """
  Synthesise one clip that contains speech with background music.

  A random transition is drawn with create_mixed_transition, random excerpts of
  one music and one speech file are loaded, and the two are mixed according to
  the transition type. The music level relative to speech comes from
  get_random_loudness_gain (defined elsewhere in the notebook).

  audio_clip_length: clip duration in seconds, forwarded to the helpers.
  sr: sample rate, used for all seconds-to-samples conversions.

  Returns (synth_audio, transition).

  NOTE(review): reads the module-level names `music_sounds` and `speech_sounds`;
  they must be defined before this function is called.
  """
  transition = create_mixed_transition(audio_clip_length = audio_clip_length)
  samples = create_mixed_samples_list(music_sounds, speech_sounds)
  segment_lengths = get_mixed_segment_lengths(transition, audio_clip_length = audio_clip_length, sr = sr)
  segments = get_mixed_random_segments(samples, segment_lengths, sr = sr)

  start_sp, stop_sp = segments['speech']
  start_mu, stop_mu = segments['music']

  params = transition[0]
  kind = params['type']
  # Transition instant in samples (not used for the plain "music+speech" type).
  t_samples = int(transition[1] * sr)

  if kind == "music+speech":
    synth_audio = _load_normalized_segment(samples['speech'], start_sp, stop_sp)
    synth_music = _load_normalized_segment(samples['music'], start_mu, stop_mu)

    m_gain = get_random_loudness_gain(synth_audio, synth_music)

    synth_audio += m_gain * synth_music

  elif kind == "speech_to_music+speech":
    synth_audio = _load_normalized_segment(samples['speech'], start_sp, stop_sp)
    synth_music = _load_normalized_segment(samples['music'], start_mu, stop_mu)

    m_gain = get_random_loudness_gain(synth_audio, synth_music)

    # Fade the music in from 0 to m_gain, then hold it at m_gain.
    apply_mixed_normal_fade_in(synth_music, transition, sr = sr, end_gain = m_gain)
    f_in_length_samples = int(params['f_in_dur'] * sr)
    synth_music[f_in_length_samples:] = synth_music[f_in_length_samples:] * m_gain

    synth_audio[t_samples:] += synth_music

  elif kind == "music_to_music+speech":
    synth_audio = _load_normalized_segment(samples['music'], start_mu, stop_mu)

    # The multiplication makes a copy, so fading it does not touch synth_audio yet.
    synth_music1 = synth_audio[0:t_samples] * params['music_gain_1']

    synth_speech = _load_normalized_segment(samples['speech'], start_sp, stop_sp)
    apply_mixed_normal_fade_in(synth_speech, transition, sr = sr)

    m_gain = get_random_loudness_gain(synth_speech, synth_audio)
    # Duck the foreground music down to the background level before speech starts.
    apply_mixed_fade_out(synth_music1, transition, sr = sr, end_gain = m_gain)

    synth_music2 = synth_audio[t_samples:] * m_gain

    synth_audio[0:t_samples] = synth_music1
    synth_audio[t_samples:] = synth_speech + synth_music2

  elif kind == "music+speech_to_music":
    synth_audio = _load_normalized_segment(samples['music'], start_mu, stop_mu)

    synth_speech = _load_normalized_segment(samples['speech'], start_sp, stop_sp)
    apply_mixed_fade_out(synth_speech, transition, sr = sr)

    m_gain = get_random_loudness_gain(synth_speech, synth_audio)

    synth_music1 = synth_audio[0:t_samples] * m_gain

    # synth_music2 is a view into synth_audio, so the fade below edits the clip in place.
    synth_music2 = synth_audio[t_samples:]
    f_in_length_samples = int(params['f_in_dur'] * sr)
    # Raise the music from the background level to its foreground gain.
    apply_mixed_normal_fade_in(synth_music2, transition, sr = sr, start_gain = m_gain, end_gain = params['music_gain_2'])
    synth_music2[f_in_length_samples:] = synth_music2[f_in_length_samples:] * params['music_gain_2']

    synth_audio[0:t_samples] = synth_speech + synth_music1
    synth_audio[t_samples:] = synth_music2

  elif kind == "music+speech_to_speech":
    synth_audio = _load_normalized_segment(samples['speech'], start_sp, stop_sp)
    synth_music = _load_normalized_segment(samples['music'], start_mu, stop_mu)

    m_gain = get_random_loudness_gain(synth_audio, synth_music)

    # Background music at m_gain, faded out to silence at the transition.
    synth_music = synth_music * m_gain
    apply_mixed_fade_out(synth_music, transition, sr = sr)

    synth_audio[0:t_samples] += synth_music

  return (synth_audio, transition)

In [None]:
def get_log_melspectrogram(audio, sr = 22050, hop_length = 220, n_fft = 1024, n_mels = 80, fmin = 64, fmax = 8000):
    """Compute a mel spectrogram of `audio` with librosa and return it converted to dB."""
    mel_options = dict(sr=sr, hop_length=hop_length, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, dtype=np.float32)
    mel_power = librosa.feature.melspectrogram(y=audio, **mel_options)
    # amin sets the floor applied before the logarithm.
    return librosa.core.power_to_db(mel_power, amin=1e-7)

In [None]:
def get_spectrogram(audio, sr = 22050, hop_length = 220, n_fft = 1024, n_mels = 80, fmin = 64, fmax = 8000):
    """Return the squared magnitude of the STFT of `audio`.

    Only `hop_length` and `n_fft` are used; the other parameters are accepted
    but have no effect here.
    """
    stft_matrix = librosa.stft(audio, hop_length=hop_length, n_fft=n_fft)
    magnitude = np.abs(stft_matrix)
    return magnitude ** 2

In [None]:
"""
A function to extract mel spectrograms. 
I am going to store db values after calculating the melspectrogram. 
"""
def extract_mel_spec(sound_files, sound_labels, sr = 22050, hop_length = 220, n_fft = 512):
  """
  Turn a batch of audio clips into dB-scaled mel spectrograms and stack their labels.

  sound_files: array whose first axis indexes the clips; each clip is flattened before analysis.
  sound_labels: indexable collection with one 2-D label array per clip.

  Returns (segments_mel_spec, labels): a float32 array of shape
  (clips, frames, mel_bands) and an int16 array of shape
  (clips, label_rows, label_columns).
  """
  def mel_db_frames(clip):
    # 80 mel bands between 64 Hz and 8 kHz, in dB relative to the clip's maximum,
    # transposed so that time runs along the first axis.
    power = librosa.feature.melspectrogram(y=clip.flatten(), sr=sr, hop_length=hop_length, n_fft=n_fft, fmin = 64, fmax = 8000, n_mels = 80)
    return librosa.core.power_to_db(power, amin=1e-7, ref = np.max).T

  # The first clip and label determine the shapes of the output arrays.
  first_spec = mel_db_frames(sound_files[0])
  first_label = sound_labels[0]

  clip_count = sound_files.shape[0]
  frame_count, band_count = first_spec.shape
  segments_mel_spec = np.zeros((clip_count, frame_count, band_count), dtype='float32')
  segments_mel_spec[0, :, :] = first_spec

  label_rows, label_cols = first_label.shape
  labels = np.zeros((clip_count, label_rows, label_cols), dtype = np.int16)
  labels[0, :, :] = first_label

  for idx in range(1, clip_count):
    segments_mel_spec[idx, :, :] = mel_db_frames(sound_files[idx])
    labels[idx, :, :] = sound_labels[idx]

  return segments_mel_spec, labels

In [None]:
"""
This function is used to divide the synthesised into different folders called blocks.
"""

def get_block_id(mel_id, block_size = 320):
  """
  Map an example id such as "mel-id-321" to its 1-based block number.

  Ids 1..block_size belong to block 1, the next block_size ids to block 2, and so on.
  """
  example_number = int(mel_id.replace("mel-id-", ""))
  zero_based_block = int((example_number - 1) // block_size)
  return zero_based_block + 1

In [None]:
def multi_to_cat(labels):
  """
  Rewrite rows of a 2-D, four-column label array in place.

  Rows equal to [1, 1, 0, 0] become [0, 0, 1, 0] and rows equal to
  [0, 0, 0, 0] become [0, 0, 0, 1]; all other rows are left as they are.
  Returns None.
  """
  both_set = np.array([1, 1, 0, 0])
  none_set = np.array([0, 0, 0, 0])

  row_count, _ = labels.shape
  for row in range(row_count):
    if (labels[row] == both_set).all():
      labels[row] = np.array([0, 0, 1, 0])
    elif (labels[row] == none_set).all():
      labels[row] = np.array([0, 0, 0, 1])

In [None]:
import os

def _events_to_frame_labels(events, no_of_labels, res_t):
    """
    Rasterise [start_time, stop_time, class_name] events into a (no_of_labels, 2) 0/1 matrix.

    Frame j is taken to start at time j * res_t and is marked active for a class
    when start_time <= j * res_t < stop_time.
    NOTE(review): the column order (0 = speech, 1 = music) is an assumption
    taken from the "speech ..., music ..." order in the label-format comment of
    generate_mixed_multiclass_labels; confirm against the training code.
    """
    column_of = {"speech": 0, "music": 1}
    frame_times = np.arange(no_of_labels) * res_t
    frame_labels = np.zeros((no_of_labels, 2))
    for start_time, stop_time, class_name in events:
        active = (frame_times >= start_time) & (frame_times < stop_time)
        frame_labels[active, column_of[class_name]] = 1.0
    return frame_labels


def synthesise_mixed_audio_examples(no_of_examples, music_dir, speech_dir, mel_dir, sr = 22050, audio_clip_length = 8.0, batch_size = 1, offset = 0):
    """
    Synthesise music+speech examples and store their mel spectrograms and labels.

    no_of_examples: number of examples requested; floor(no_of_examples / batch_size) batches are written.
    music_dir, speech_dir: accepted for interface compatibility; not used here.
    mel_dir: output root. Files go to <mel_dir>/block-id-<k>/mel-id-<n>.npy and
      <mel_dir>/block-id-<k>/mel-id-label-<n>.npy.
    sr: sample rate of the synthesised audio.
    audio_clip_length: clip duration in seconds.
    batch_size: clips per saved file; their frames are stacked along the first axis.
    offset: added to the running index so that numbering can continue from an earlier run.
    """
    no_of_batches = int(np.floor(no_of_examples / batch_size))

    clip_samples = int(audio_clip_length * sr)
    # One label row per 220 audio samples.
    res_t = 220 / sr
    no_of_labels = int(np.ceil(audio_clip_length / res_t))

    for b in range(no_of_batches):
      batch_audio = np.zeros((batch_size, clip_samples))
      batch_labels = np.zeros((batch_size, no_of_labels, 2))

      for i in range(batch_size):
          synth_audio, transition = create_mixed_audio_clip(audio_clip_length = audio_clip_length, sr = sr)
          batch_audio[i, :] = synth_audio

          # generate_mixed_multiclass_labels returns an event list (with class-name
          # strings); it has to be rasterised before it fits the frame-level matrix.
          events = generate_mixed_multiclass_labels(transition, audio_clip_length = audio_clip_length)
          batch_labels[i, :, :] = _events_to_frame_labels(events, no_of_labels, res_t)

      mel_id = 'mel-id-' + str(b + 1 + offset)
      mel_specs, frame_labels = extract_mel_spec(batch_audio, batch_labels, sr = sr)

      # str() must wrap only the block number; the previous form added '/' to an int.
      pp = mel_dir + '/block-id-' + str(get_block_id(mel_id)) + '/'

      # Also creates mel_dir itself when it does not exist yet.
      os.makedirs(pp, exist_ok = True)

      # Drop the batch axis using the actual array shapes instead of hard-coded
      # (802, 128) / (802, 2), which did not match the 80 mel bands produced above.
      np.save(pp + mel_id + '.npy', mel_specs.reshape((-1, mel_specs.shape[-1])))
      np.save(pp + 'mel-id-label-' + str(b + 1 + offset) + '.npy', frame_labels.reshape((-1, frame_labels.shape[-1])))

## Synthesis without background music

In [None]:
def check_overlap(transition_points, point, min_segment_length):
    """
    Tell whether `point` lies too close to any existing transition point.

    "Too close" means within `min_segment_length + 2.0` of a point in
    `transition_points`. Returns False for an empty list.
    """
    min_distance = min_segment_length + 2.0  # extra 2.0 spreads the transitions further apart
    return any(np.absolute(point - existing) <= min_distance for existing in transition_points)

def create_random_transition_points(audio_clip_length, min_segment_length = 1.0, max_f_out_dur = 0.5, max_f_in_dur = 0.0):
    """
    Draw the transition times (in seconds) for one audio clip.

    With `max_no_transitions = 2` the clip gets either no transition (an empty
    list is returned) or exactly one. Every point is drawn uniformly between
    `min_segment_length + max_f_out_dur` and
    `audio_clip_length - min_segment_length - max_f_in_dur`.
    Returns the points sorted in ascending order.
    """
    # If max_no_transitions = 2, an example can have at most 1 transition (max_no_transitions - 1).
    max_no_transitions = 2
    retry_limit = 100000

    def draw_point():
        return np.random.uniform(min_segment_length + max_f_out_dur, audio_clip_length - min_segment_length - max_f_in_dur)

    number_of_transitions = np.random.randint(0, max_no_transitions)
    if number_of_transitions == 0:
        return []
    transition_points = [draw_point()]

    retries_left = retry_limit
    while len(transition_points) < number_of_transitions:
        candidate = draw_point()
        if not check_overlap(transition_points, candidate, min_segment_length):
            transition_points.append(candidate)
            retries_left = retry_limit
            continue

        retries_left -= 1
        if retries_left < 0:
            # No room for another point: start over with a fresh number of
            # transitions and a fresh first point.
            number_of_transitions = np.random.randint(0, max_no_transitions)
            if number_of_transitions == 0:
                return []
            transition_points = [draw_point()]

    transition_points.sort()
    return transition_points

def create_transition(max_f_out_dur = 1.0, max_f_in_dur = 1.0, max_c_fade_dur = 1.0, max_time_gap = 0.2):
    """
    Randomly draw the parameters of one transition and return them as a dictionary.

    A "normal" transition has the keys
    {type, f_out_curve, f_out_dur, time_gap, f_in_curve, f_in_dur}.
    A "cross-fade" transition has the keys
    {type, f_out_curve, f_out_dur, f_in_curve, f_in_dur}.
    In both cases an `exp_value` key is added when the fade-out or the fade-in curve is one of
    exp-convex, exp-concave or s-curve.

    Fade durations of a normal transition are bounded by `max_f_out_dur` / `max_f_in_dur`;
    both durations of a cross-fade are bounded by `max_c_fade_dur`.
    """
    curve_names = ['linear', 'exp-convex', 'exp-concave', 's-curve']
    curves_with_exponent = ("exp-convex", "exp-concave", "s-curve")

    kind = random.choice(['normal', 'cross-fade'])
    is_normal = kind == "normal"
    out_limit = max_f_out_dur if is_normal else max_c_fade_dur
    in_limit = max_f_in_dur if is_normal else max_c_fade_dur

    transition = {'type': kind}
    transition['f_out_curve'] = random.choice(curve_names)
    transition['f_out_dur'] = np.random.uniform(0, out_limit)
    if is_normal:
        # Only positive time gaps are generated for the moment.
        transition['time_gap'] = np.random.uniform(0.0, max_time_gap)
    transition['f_in_curve'] = random.choice(curve_names)
    transition['f_in_dur'] = np.random.uniform(0, in_limit)

    # One exponent is shared by both fades. It is drawn once per non-linear curve, so when both
    # curves are non-linear the second draw replaces the first.
    for curve_key in ('f_out_curve', 'f_in_curve'):
        if transition[curve_key] in curves_with_exponent:
            transition['exp_value'] = np.random.uniform(1.5, 3.0)

    return transition

def create_transition_list(transition_points):
    """
    Build one random transition per transition point.

    Returns a list of tuples (transition, time_stamp), in the order of `transition_points`.
    The last point uses the default fade limits of `create_transition`; earlier points limit
    their fades to a fraction of the preceding segment length (capped at 1.0 second).
    Every point except a non-final first one passes `time_stamp - 1.0` as `max_time_gap`.
    """
    last_index = len(transition_points) - 1
    transitions_list = []

    for index, time_stamp in enumerate(transition_points):
        if index == last_index:
            options = {'max_time_gap': time_stamp - 1.0}
        else:
            # Length of the segment that ends at this transition point.
            if index == 0:
                s_len = time_stamp
            else:
                s_len = time_stamp - transition_points[index - 1]
            options = {
                'max_f_out_dur': min(0.2 * s_len, 1.0),
                'max_f_in_dur': min(0.1 * s_len, 1.0),
                'max_c_fade_dur': min(0.1 * s_len, 1.0),
            }
            if index != 0:
                options['max_time_gap'] = time_stamp - 1.0

        transitions_list.append((create_transition(**options), time_stamp))

    return transitions_list

def create_class_list(no_of_classes):
    """
    Return `no_of_classes` class names drawn independently at random.

    'music' and 'speech' are each drawn with weight 0.4 and 'noise' with weight 0.2.
    """
    class_names = ['music', 'speech', 'noise']
    class_weights = [0.4, 0.4, 0.2]
    return random.choices(class_names, weights=class_weights, k=no_of_classes)

def create_alternating_class_list(no_of_classes):
    """
    Return a list that alternates between 'music' and 'speech', starting with a random one.

    The list always holds at least one element, even when `no_of_classes` is below 1.
    """
    other_class = {'music': 'speech', 'speech': 'music'}
    class_list = [random.choice(['music', 'speech'])]

    while len(class_list) < no_of_classes:
        following = other_class.get(class_list[-1])
        if following is None:
            print("Encountered unexpected class!!")
            raise ValueError("Encountered unexpected class!!")
        class_list.append(following)

    return class_list

def create_repeating_class_list(no_of_classes):
    """
    Return a list in which one randomly chosen class ('music' or 'speech') is repeated.

    The list always holds at least one element, even when `no_of_classes` is below 1.
    """
    class_list = [random.choice(['music', 'speech'])]

    while len(class_list) < no_of_classes:
        previous = class_list[-1]
        if previous not in ('music', 'speech'):
            print("Encountered unexpected class!!")
            raise ValueError("Encountered unexpected class!!")
        class_list.append(previous)

    return class_list

def create_noise_containing_class_list(no_of_classes):
    """
    Return a random class list in which every 'music' or 'speech' entry is followed by 'noise'.

    The first entry, and every entry that follows a 'noise' entry, is drawn at random from
    'music', 'speech' and 'noise'.
    The list always holds at least one element, even when `no_of_classes` is below 1.
    """
    candidates = ['music', 'speech', 'noise']
    class_list = [random.choice(candidates)]

    while len(class_list) < no_of_classes:
        previous = class_list[-1]
        if previous in ('music', 'speech'):
            class_list.append('noise')
        elif previous == 'noise':
            class_list.append(random.choice(['music', 'speech', 'noise']))
        else:
            print("Encountered unexpected class!!")
            raise ValueError("Encountered unexpected class!!")

    return class_list

def create_samples_list(class_list, music_sounds, speech_sounds, noise_sounds):
    """
    Pick one random sound file for every entry of `class_list`.

    'music' entries draw from `music_sounds`, 'speech' entries from `speech_sounds` and
    'noise' entries from `noise_sounds`. Any other class name raises a ValueError.
    Returns the chosen files in the order of `class_list`.
    """
    pools = {"music": music_sounds, "speech": speech_sounds, "noise": noise_sounds}

    samples_list = []
    for class_name in class_list:
        if class_name not in pools:
            print("Encountered unexpected class!!")
            raise ValueError("Encountered unexpected class!!")
        samples_list.append(random.choice(pools[class_name]))
    return samples_list

def get_segment_lengths(transitions_list, audio_clip_length):
    """
    Return the length of every segment of the clip.

    `transitions_list` holds tuples (transition, time_stamp). The segment boundaries are 0,
    every time stamp in list order, and `audio_clip_length`; the result holds the differences
    between consecutive boundaries.
    """
    boundaries = [0]
    boundaries.extend(time_stamp for (_, time_stamp) in transitions_list)
    boundaries.append(audio_clip_length)
    return [later - earlier for earlier, later in zip(boundaries, boundaries[1:])]

def get_random_segments(samples_list, segment_lengths, f_buffer = 1.1):
    """
    Pick a random excerpt of the requested length from every sound file.

    `samples_list` holds the sound file paths and `segment_lengths` the wanted excerpt lengths
    in seconds; both must have the same length, otherwise a ValueError is raised.
    `f_buffer` is the margin (in seconds) kept free at the beginning and at the end of each file.

    Returns a list of tuples (segment_start, segment_end) in seconds.
    """
    if len(samples_list) != len(segment_lengths):
        print("The length of samples_list needs to be equal to segment_lengths!!")
        raise ValueError("Data mismatch --- The length of samples_list needs to be equal to segment_lengths!!")

    segments = []

    for sample_path, segment_length in zip(samples_list, segment_lengths):
        # Only the duration is needed, so read the header instead of decoding the whole file.
        info = sf.info(sample_path)
        sample_length = float(info.frames / info.samplerate)
        # NOTE(review): a file shorter than segment_length + 2 * f_buffer makes the upper bound
        # smaller than the lower bound; np.random.uniform does not reject that.
        r = np.random.uniform(f_buffer, sample_length - segment_length - f_buffer)
        segments.append((r, r + segment_length))
    return segments

def create_template_audio_clip(audio_clip_length, samples_list, segments, sr):
    """
    Stitch the individual audio excerpts into one array, without applying any transition.

    `segments[i]` is the (start, end) position in seconds of the excerpt taken from
    `samples_list[i]`; `sr` converts these positions to sample indices. Every excerpt is
    normalised with `librosa.util.normalize` before it is appended.

    Returns `synth_audio`, the concatenated audio, and `synth_audio_seg_samples`, a list of
    tuples (audio clip start, audio clip stop) in samples. These tuples are the reference
    points for the fade in and fade out operations.
    """
    if (len(samples_list) < 1):
        print("The samples_list argument is invalid!!")
        raise ValueError("The samples_list argument is invalid!!")

    excerpts = []
    synth_audio_seg_samples = [] # Segment boundaries inside the synthesised audio.
    write_position = 0 # Sample index at which the next excerpt starts.

    for index, sample_path in enumerate(samples_list):
        first_frame = int(segments[index][0] * sr)
        last_frame = int(np.ceil(segments[index][1] * sr))
        excerpt_end = write_position + last_frame - first_frame
        synth_audio_seg_samples.append((write_position, excerpt_end))
        write_position = excerpt_end

        excerpt, _ = sf.read(sample_path, start = first_frame, stop = last_frame)
        excerpts.append(librosa.util.normalize(excerpt))

    synth_audio = np.concatenate(excerpts, axis = 0)
    return synth_audio, synth_audio_seg_samples
    
def apply_normal_fade_out(audio, transition, synth_audio_seg_samples, sr):
    """
    Apply the fade out of a normal transition to the `audio` array in place.

    `transition` is a tuple whose first element is the transition parameter dictionary.
    `synth_audio_seg_samples` is the tuple (start, stop) of the segment that fades out, in samples.
    The last `time_gap` seconds of the segment are set to 0 and the fade curve is applied to the
    `f_out_dur` seconds right before that gap. An unknown curve name only silences the gap.
    """
    params = transition[0]
    _, seg_stop = synth_audio_seg_samples

    n_fade = int(params['f_out_dur'] * sr)

    # `stop_shrunk` is the new end point after silencing `time_gap` samples.
    stop_shrunk = seg_stop - int(params['time_gap'] * sr)
    audio[stop_shrunk:seg_stop] = 0.0

    shape = params['f_out_curve']
    if shape == "linear":
        fade_curve = np.linspace(1.0, 0.0, num = n_fade)
    elif shape == "exp-concave":
        fade_curve = np.linspace(1.0, 0.0, num = n_fade) ** params['exp_value']
    elif shape == "exp-convex":
        fade_curve = 1 - np.linspace(0.0, 1.0, num = n_fade) ** params['exp_value']
    elif shape == "s-curve":
        # First half falls from 1 to 0.5, second half from 0.5 to 0.
        half = int(n_fade / 2)
        ramp_1 = np.linspace(0, 1, num = half)
        ramp_2 = np.linspace(0, 1, num = n_fade - half)
        x = params['exp_value']
        upper_half = 0.5 * (1 - ramp_1 ** x) + 0.5
        lower_half = 0.5 * (1 - ramp_2)  ** x
        fade_curve = np.concatenate((upper_half, lower_half))
    else:
        return

    fade_region = slice(stop_shrunk - n_fade, stop_shrunk)
    audio[fade_region] = audio[fade_region] * fade_curve

def apply_normal_fade_in(audio, transition, synth_audio_seg_samples, sr):
    """
    Apply the fade in of a normal transition to the `audio` array in place.

    `transition` is a tuple whose first element is the transition parameter dictionary.
    `synth_audio_seg_samples` is the tuple (start, stop) of the segment that fades in, in samples.
    The fade curve is applied to the first `f_in_dur` seconds of that segment. An unknown curve
    name leaves the audio untouched.
    """
    params = transition[0]
    seg_start, _ = synth_audio_seg_samples
    n_fade = int(params['f_in_dur'] * sr)

    shape = params['f_in_curve']
    if shape == "linear":
        fade_curve = np.linspace(0.0, 1.0, num = n_fade)
    elif shape == "exp-concave":
        fade_curve = np.linspace(0.0, 1.0, num = n_fade) ** params['exp_value']
    elif shape == "exp-convex":
        fade_curve = 1 - np.linspace(1.0, 0.0, num = n_fade) ** params['exp_value']
    elif shape == "s-curve":
        # First half rises from 0 to 0.5, second half from 0.5 to 1.
        half = int(n_fade / 2)
        ramp_1 = np.linspace(0, 1, num = half)
        ramp_2 = np.linspace(0, 1, num = n_fade - half)
        x = params['exp_value']
        lower_half = 0.5 * ramp_1 ** x
        upper_half = 0.5 * (1 - (1 - ramp_2)  ** x) + 0.5
        fade_curve = np.concatenate((lower_half, upper_half))
    else:
        return

    fade_region = slice(seg_start, seg_start + n_fade)
    audio[fade_region] = audio[fade_region] * fade_curve

def apply_cross_fade_out(audio, transition, sample, segment, synth_audio_seg_samples, sr):
    """
    Apply the fade out half of a cross-fade to the `audio` array in place.

    `sample` is the sound file that fades out and `segment` the (start, end) position in seconds
    of its excerpt inside that file. `synth_audio_seg_samples` is the tuple (start, stop) of the
    same excerpt inside the synthesised clip, in samples.
    The `f_out_dur` seconds that follow the excerpt in the sound file are read, normalised,
    faded out and added to the audio that directly follows the excerpt in the clip.
    Nothing happens when the fade is shorter than one sample or the curve name is unknown.
    """
    params = transition[0]
    n_fade = int(params['f_out_dur'] * sr)
    if n_fade <= 0:
        return

    _, segment_end = segment
    tail_begin = int(segment_end * sr)
    _, clip_stop = synth_audio_seg_samples

    tail, _ = sf.read(sample, start = tail_begin, stop = tail_begin + n_fade)
    tail = librosa.util.normalize(tail)

    shape = params['f_out_curve']
    if shape == "linear":
        fade_curve = np.linspace(1.0, 0.0, num = n_fade)
    elif shape == "exp-concave":
        fade_curve = np.linspace(1.0, 0.0, num = n_fade) ** params['exp_value']
    elif shape == "exp-convex":
        fade_curve = 1 - np.linspace(0.0, 1.0, num = n_fade) ** params['exp_value']
    elif shape == "s-curve":
        # First half falls from 1 to 0.5, second half from 0.5 to 0.
        half = int(n_fade / 2)
        ramp_1 = np.linspace(0, 1, num = half)
        ramp_2 = np.linspace(0, 1, num = n_fade - half)
        x = params['exp_value']
        upper_half = 0.5 * (1 - ramp_1 ** x) + 0.5
        lower_half = 0.5 * (1 - ramp_2)  ** x
        fade_curve = np.concatenate((upper_half, lower_half))
    else:
        return

    overlap = slice(clip_stop, clip_stop + n_fade)
    audio[overlap] = audio[overlap] + tail * fade_curve

def apply_cross_fade_in(audio, transition, sample, segment, synth_audio_seg_samples, sr):
    """
    Apply the fade in half of a cross-fade to the `audio` array in place.

    `sample` is the sound file that fades in and `segment` the (start, end) position in seconds
    of its excerpt inside that file. `synth_audio_seg_samples` is the tuple (start, stop) of the
    same excerpt inside the synthesised clip, in samples.
    The `f_in_dur` seconds that precede the excerpt in the sound file are read, normalised,
    faded in and added to the audio that directly precedes the excerpt in the clip.
    Nothing happens when the fade is shorter than one sample or the curve name is unknown.
    """
    params = transition[0]
    n_fade = int(params['f_in_dur'] * sr)
    if n_fade <= 0:
        return

    segment_begin, _ = segment
    lead_end = int(segment_begin * sr)
    clip_start, _ = synth_audio_seg_samples

    lead, _ = sf.read(sample, start = lead_end - n_fade, stop = lead_end)
    lead = librosa.util.normalize(lead)

    shape = params['f_in_curve']
    if shape == "linear":
        fade_curve = np.linspace(0.0, 1.0, num = n_fade)
    elif shape == "exp-concave":
        fade_curve = np.linspace(0.0, 1.0, num = n_fade) ** params['exp_value']
    elif shape == "exp-convex":
        fade_curve = 1 - np.linspace(1.0, 0.0, num = n_fade) ** params['exp_value']
    elif shape == "s-curve":
        # First half rises from 0 to 0.5, second half from 0.5 to 1.
        half = int(n_fade / 2)
        ramp_1 = np.linspace(0, 1, num = half)
        ramp_2 = np.linspace(0, 1, num = n_fade - half)
        x = params['exp_value']
        lower_half = 0.5 * ramp_1 ** x
        upper_half = 0.5 * (1 - (1 - ramp_2)  ** x) + 0.5
        fade_curve = np.concatenate((lower_half, upper_half))
    else:
        return

    overlap = slice(clip_start - n_fade, clip_start)
    audio[overlap] = audio[overlap] + lead * fade_curve

def create_audio_clip(audio_clip_length, transitions_list, samples_list, segments, sr):
    """
    Return the synthesised audio clip after applying the transitions.

    `audio_clip_length` is the clip length in seconds and `sr` the sample rate used to convert
    seconds to samples. `transitions_list` holds tuples (transition, time_stamp),
    `samples_list` the sound files of the segments and `segments` the (start, end) position in
    seconds of the excerpt taken from each file.

    Without transitions, a random excerpt of the clip length is cut from the normalised first
    sound file (`segments` is not used in that case). Otherwise the excerpts are stitched
    together, each transition is applied at its segment boundary, and the result is trimmed
    to the clip length.

    Raises ValueError when `samples_list` is empty or, in the no-transition case, when the
    first sound file is shorter than the requested clip.
    """
    if (len(samples_list) < 1):
        print("The samples_list argument is invalid!!")
        raise ValueError("The samples_list argument is invalid!!")

    clip_samples = int(audio_clip_length * sr)

    if len(transitions_list) == 0:
        # Only this branch needs the complete first file, so it is decoded here and not up front.
        a, _ = sf.read(samples_list[0])
        a = librosa.util.normalize(a)
        l_a = a.shape[0]
        if l_a < clip_samples:
            raise ValueError("The sound file {} is shorter than the requested audio clip length!!".format(samples_list[0]))
        if l_a == clip_samples:
            return a[0:clip_samples]
        l_st = np.random.randint(0, l_a - clip_samples)
        return a[l_st:l_st + clip_samples]

    synth_audio, synth_audio_seg_samples = create_template_audio_clip(audio_clip_length, samples_list, segments, sr)
    for i, transition in enumerate(transitions_list):
        # Transition i sits between segment i (fading out) and segment i + 1 (fading in).
        if transition[0]['type'] == "normal":
            apply_normal_fade_out(synth_audio, transition, synth_audio_seg_samples[i], sr)
            apply_normal_fade_in(synth_audio, transition, synth_audio_seg_samples[i + 1], sr)

        elif transition[0]['type'] == "cross-fade":
            apply_cross_fade_out(synth_audio, transition, samples_list[i], segments[i], synth_audio_seg_samples[i], sr)
            apply_cross_fade_in(synth_audio, transition, samples_list[i + 1], segments[i + 1], synth_audio_seg_samples[i + 1], sr)

    # Trim the audio to the correct length
    return synth_audio[0:clip_samples]

def generate_multiclass_labels(audio_clip_length, transitions_list, class_list, sr = 22050.0, res = 220):
  """
  Generate the labels of a synthesised clip as a list of [start, end, class] entries (in seconds).

  `audio_clip_length` is the clip length in seconds and is the end time of the last label.
  `transitions_list` holds tuples (transition, time_stamp); only the first transition is
  evaluated. `class_list` holds the class of each segment; 'noise' segments get no label.
  `sr` and `res` (in samples) are accepted but not used by this function.

  For a cross-fade the first label is extended by `f_out_dur` and the second label starts
  `f_in_dur` early. For a normal transition the first label ends `time_gap` before the
  transition point. An unknown transition type yields no labels.
  """
  labels = []
  clip_end = float(audio_clip_length)

  if len(transitions_list) == 0:
    if class_list[0] != "noise":
      labels.append([0.0, clip_end, class_list[0]])
    return labels

  params, time_stamp = transitions_list[0][0], transitions_list[0][1]

  if params['type'] == "cross-fade":
    first_end = time_stamp + params['f_out_dur']
    second_start = time_stamp - params['f_in_dur']
  elif params['type'] == "normal":
    first_end = time_stamp - params['time_gap']
    second_start = time_stamp
  else:
    return labels

  if class_list[0] != "noise":
    labels.append([0.0, first_end, class_list[0]])

  if class_list[1] != "noise":
    labels.append([second_start, clip_end, class_list[1]])

  return labels

## Combining the data syntheses

In [None]:
def synthesise_combined_audio_examples(no_of_examples, mel_dir, sr = 22050, audio_clip_length = 8.0, batch_size = 1, offset = 0):
    """
    Synthesise audio examples and store their mel spectrograms and labels below `mel_dir`.

    `floor(no_of_examples / batch_size)` examples are generated. Each one is randomly either a
    "mixed" example (built by `create_mixed_audio_clip`) or an "old" example (segments joined
    by at most one transition). Example number `b + 1 + offset` is stored as
    `mel-id-<number>.npy` (transposed spectrogram) and `mel-id-label-<number>.pickle` (labels)
    inside the sub-directory `block-id-<get_block_id(mel_id)>` of `mel_dir`.

    `sr` and `audio_clip_length` are forwarded to the "old" synthesis path only;
    `create_mixed_audio_clip` takes no arguments here.
    The "old" path reads the module-level lists `music_sounds`, `speech_sounds` and `noise_sounds`.

    Returns the tuple (count_old, count_mixed) with the number of examples of each kind.
    """
    count_old = 0
    count_mixed = 0

    no_of_batches = int(np.floor(no_of_examples / batch_size))

    for b in range(no_of_batches):
        c = random.choice(["old", "mixed"])

        if c == "mixed":
            count_mixed += 1
            synth_audio, transition = create_mixed_audio_clip()
            synth_audio = librosa.util.normalize(synth_audio)

            labels = generate_mixed_multiclass_labels(transition)

        else:
            count_old += 1
            p = create_random_transition_points(audio_clip_length, 1.0) # Create random transition points.

            transitions_list = create_transition_list(p) # Create a list of transitions.

            class_list = create_class_list(len(p) + 1) # Create a random class for every segment.

            samples_list = create_samples_list(class_list, music_sounds, speech_sounds, noise_sounds) # Randomly select one sound file per segment.

            segment_lengths = get_segment_lengths(transitions_list, audio_clip_length)

            random_segments = get_random_segments(samples_list, segment_lengths)

            synth_audio = create_audio_clip(audio_clip_length, transitions_list, samples_list, random_segments, sr)
            synth_audio = librosa.util.normalize(synth_audio)

            labels = generate_multiclass_labels(audio_clip_length, transitions_list, class_list)

        example_number = str(b + 1 + offset)
        mel_id = 'mel-id-' + example_number

        pp = mel_dir + '/block-id-' + str(get_block_id(mel_id)) + '/'
        # Also creates `mel_dir` itself when it does not exist yet.
        os.makedirs(pp, exist_ok = True)

        M = get_log_melspectrogram(synth_audio)
        np.save(pp + mel_id + '.npy', M.T)

        with open(pp + 'mel-id-label-' + example_number + ".pickle", 'wb') as f:
            pickle.dump(labels, f, pickle.HIGHEST_PROTOCOL)

    return count_old, count_mixed

In [None]:
def _of_window_starts(audio_len, example_len):
    """Return the start indices of all full windows, plus one end-aligned window when a remainder is left over."""
    starts = [example_len * step for step in range(audio_len // example_len)]
    if audio_len >= example_len and audio_len % example_len != 0:
        starts.append(audio_len - example_len)
    return starts


def _write_of_example(synth_audio, labels, example_number, mel_dir):
    """Normalise one excerpt and save its transposed mel spectrogram and its labels as .npy files."""
    synth_audio = librosa.util.normalize(synth_audio)

    mel_id = 'mel-id-' + str(example_number)
    M = get_log_melspectrogram(synth_audio)

    pp = mel_dir + '/block-id-' + str(get_block_id(mel_id)) + '/'
    os.makedirs(pp, exist_ok = True)

    np.save(pp + mel_id + '.npy', M.T)
    np.save(pp + 'mel-id-label-' + str(example_number) + '.npy', labels)


def synthesise_examples_OF(music_dir, speech_dir, mel_dir, sr = 22050, audio_clip_length = 8.0, batch_size = 1, offset = 0):
    """
    Cut the original sound files into fixed-length examples and store them below `mel_dir`.

    The files are taken from the module-level lists `speech_sounds` and `music_sounds`
    (speech first); `music_dir`, `speech_dir` and `batch_size` are accepted but not used.
    Files shorter than `sr * audio_clip_length` samples are skipped. Every other file is cut
    into consecutive non-overlapping windows; when a remainder is left over, one additional
    window aligned with the end of the file is added.

    Example number `b + 1 + offset` is stored as `mel-id-<number>.npy` and
    `mel-id-label-<number>.npy` inside the sub-directory `block-id-<get_block_id(mel_id)>`.
    The labels are an int16 array of shape (802, 2): column 0 is set to 1 for speech examples
    and column 1 is set to 1 for music examples.
    """
    example_len = int(sr * audio_clip_length)

    # 802 = ceil(8.0 * 22050 / 220); presumably one label per 220-sample frame of a
    # default-length clip -- TODO confirm against get_log_melspectrogram.
    no_of_labels = 802

    b = 0

    for sound_files, class_column in ((speech_sounds, 0), (music_sounds, 1)):
        for s in sound_files:
            audio, _ = sf.read(s)
            audio_len = audio.shape[0]

            for start in _of_window_starts(audio_len, example_len):
                labels = np.zeros((no_of_labels, 2), dtype = np.int16)
                labels[:, class_column] = 1

                _write_of_example(audio[start:start + example_len], labels, b + 1 + offset, mel_dir)
                b += 1

In [None]:

# Make the file lists available under the global names that the synthesis functions read.
speech_sounds, music_sounds, noise_sounds = speech_list, music_list, noise_list
for sound_files in (speech_sounds, music_sounds, noise_sounds):
    print(len(sound_files))

# Location of the directory to store the synthesised examples.
mel_dir = "/content/Mel Files"

In [None]:
# Directory on Google Drive that receives the zipped synthesis results.
synth_dir = "/content/drive/My Drive/Data Synthesis"
try:
    # exist_ok = True: an already existing directory is not an error.
    os.makedirs(synth_dir, exist_ok = True)
    print("Directory '%s' created successfully" % synth_dir)
except OSError as error:
    # Name the directory and the reason; the original message printed a literal '%s'.
    print("Directory '%s' can not be created: %s" % (synth_dir, error))

In [None]:
import pyloudnorm as pyln

In [None]:
import os
import shutil
from zipfile import ZipFile

In [None]:
"""
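Split the data synthesis into parts of up to 5120 examples; each part is written to its own zip file.
Part i numbers its examples starting from 5120 * i + 1. Looping i over range(0, 8) with 5120 examples per part creates 5120 * 8 = 40960 examples.
As written, the loop below synthesises a single part (range(0, 1)) of 500 examples.
If you are synthesising the validation set, a single part (range(0, 1)) might be appropriate.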
"""
# Seed applied below to both Python's `random` module and NumPy's global RNG;
# it also names the output sub-directory ("Seed" + str(expt_seed)).
expt_seed = 8

def get_random_loudness_gain(speech_data, music_data, rate = 22050):
  """
  Return a random linear gain for `music_data` so that it ends up quieter than `speech_data`.

  The integrated loudness of both signals is measured with a `pyln.Meter` created for `rate`.
  A target loudness is drawn uniformly between 33.0 and 4.0 below the speech loudness, and the
  difference between that target and the music loudness is converted to a linear factor
  with 10 ** (difference / 20).
  """
  loudness_meter = pyln.Meter(rate)
  speech_level = loudness_meter.integrated_loudness(speech_data)
  music_level = loudness_meter.integrated_loudness(music_data)

  # Range of the loudness difference between speech and background music.
  target_level = np.random.uniform(speech_level - 33.0, speech_level - 4.0)

  return np.power(10.0, (target_level - music_level) / 20.0)

# Seed both random number generators used by the synthesis functions so that a run is reproducible.
random.seed(expt_seed)
np.random.seed(expt_seed)

# One output directory per seed; exist_ok lets the cell be re-run without crashing.
seed_dir = synth_dir + "/Seed" + str(expt_seed)
os.makedirs(seed_dir, exist_ok = True)

for i in range(0, 1):
    zip_file_name = seed_dir + "/d" + str(i + 1) + ".zip"

    # Start every part from an empty working directory, also after an interrupted earlier run.
    if os.path.isdir(mel_dir):
        shutil.rmtree(mel_dir)
    os.makedirs(mel_dir)

    count_old, count_mixed = synthesise_combined_audio_examples(500, mel_dir, offset = 5120 * i)
    mel_files = glob.glob(mel_dir + '/**/*.*', recursive=True)
    # Print a summary instead of the full list of file paths.
    print("Part {}: {} files written ({} old, {} mixed examples)".format(i + 1, len(mel_files), count_old, count_mixed))

    with ZipFile(zip_file_name, 'w') as my_zip:
        for f in mel_files:
            my_zip.write(f)

    # The zip archive holds everything now; free the local disk space.
    shutil.rmtree(mel_dir)