<a href="https://colab.research.google.com/github/satvik-venkatesh/train-synth-audio-seg/blob/main/mk-prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Please note that the pre-trained models are for non-commercial use only. They are released under the [Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0) license](https://creativecommons.org/licenses/by-nc-sa/3.0/).


In [None]:
!pip install numba==0.48

In [None]:
!pip install soundfile==0.10.3.post1
!sudo apt-get install sox
!pip install sed_eval
!pip install librosa==0.7.2

In [None]:
!git clone https://github.com/satvik-venkatesh/train-synth-audio-seg.git

In [None]:
import soundfile as sf
import numpy as np
from shutil import copyfile
import os
import librosa
import tensorflow as tf
import math
import sed_eval
import dcase_util
import json
import pickle
import glob
from subprocess import Popen, PIPE

# Functions

In [None]:
def remove_silence(sound):
  """
  Strip silent passages from an audio file in place using the `sox` command-line tool.

  The sox `silence` effect settings are copied from the Lemaire et al. (2019) github repository.

  Args:
    sound: Path of the audio file. The file is overwritten with the processed audio.

  Raises:
    RuntimeError: If sox exits with a non-zero status. The input file is left untouched.
  """
  # Insert "_t" in front of the extension so that the temporary file name always
  # differs from the input name, whatever the extension is.
  root, ext = os.path.splitext(sound)
  temp_file = root + "_t" + ext

  # Pass the arguments as a list (no shell) so that paths containing spaces or
  # shell metacharacters reach sox unchanged.
  command = ["sox", sound, temp_file, "silence", "-l", "1", "0.1", "1%", "-1", "0.1", "1%"]
  p = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  output, err = p.communicate()

  if p.returncode != 0:
    # Do not leave a partial temporary file behind and do not overwrite the input.
    if os.path.exists(temp_file):
      os.remove(temp_file)
    raise RuntimeError("sox failed on '{}': {}".format(sound, err.decode(errors="replace").strip()))

  copyfile(temp_file, sound)
  os.remove(temp_file)


def get_sound_segments(sound_list, segment_size = 1.0, sampling_rate = 22050.0):
  """
  Load audio files, resample them and cut them into fixed-length segments.

  Every file is read as float32, mixed down to mono if it has several channels,
  resampled to `sampling_rate` and split into consecutive non-overlapping
  segments. Trailing samples that do not fill a whole segment are dropped.

  Args:
    sound_list: Sequence of audio file paths.
    segment_size: Segment length in seconds.
    sampling_rate: Sampling rate (Hz) that every file is resampled to.

  Returns:
    A float32 array of shape (total_segments, int(sampling_rate * segment_size))
    holding the segments of all files in the order of `sound_list`. The array
    has zero rows when `sound_list` is empty or every file is shorter than one
    segment.
  """
  # Segment size in samples.
  ss = int(sampling_rate * segment_size)

  all_segments = []
  for path in sound_list:
    # Read every file with the same dtype so the result does not depend on
    # the position of a file in the list.
    in_signal, in_sr = sf.read(path, dtype='float32')

    # Multi-channel files come back as (frames, channels); resampling expects
    # the time axis last, so mix down to mono first.
    if in_signal.ndim > 1:
      in_signal = librosa.to_mono(in_signal.T)

    in_signal = librosa.resample(in_signal, orig_sr=in_sr, target_sr=sampling_rate)

    # Number of whole segments in this file. The explicit (n, ss) shape also
    # works for n == 0, where (n, -1) cannot be inferred.
    n = in_signal.shape[0] // ss
    all_segments.append(np.reshape(in_signal[0:n * ss], (n, ss)))

  if not all_segments:
    return np.zeros((0, ss), dtype='float32')

  return np.concatenate(all_segments, axis = 0)

"""
A function to extract mel spectrograms.
"""
def extract_mel_spec(segments, sr = 22050, hop_length = 220, n_fft = 1024):
  """
  Compute a mel spectrogram for every audio segment.

  All segments use the same analysis settings (80 mel bands between 64 Hz and
  8000 Hz), so every row of the result is directly comparable.

  Args:
    segments: 2-D array with one audio segment per row.
    sr: Sampling rate of the segments in Hz.
    hop_length: Hop length of the STFT in samples.
    n_fft: FFT size in samples.

  Returns:
    A float32 array of shape (num_segments, n_mels, n_frames) where
    `segments_mel_spec[i]` is the mel spectrogram of `segments[i]`.
  """
  def _mel(signal):
    # Single place that fixes the mel parameters for every segment.
    return librosa.feature.melspectrogram(y=signal, sr=sr, hop_length=hop_length, n_fft=n_fft,
                                          fmin=64, fmax=8000, n_mels=80)

  (m, _) = segments.shape

  # The first spectrogram determines the shape of the output buffer.
  mel_spec0 = _mel(segments[0, :])
  (n, o) = mel_spec0.shape

  segments_mel_spec = np.zeros((m, n, o), dtype='float32')
  segments_mel_spec[0, :, :] = mel_spec0

  for i in range(1, m):
    segments_mel_spec[i, :, :] = _mel(segments[i, :])

  return segments_mel_spec



"""
This function was copied from Lemaire et al. 2019. 
"""

def smooth_output(output, min_speech=1.3, min_music=3.4, max_silence_speech=0.4, max_silence_music=0.6):
    """
    Post-process binary frame-wise predictions in place.

    Row 0 of `output` is treated with the speech thresholds and row 1 with the
    music thresholds. Two passes are applied to each row:

    1. Gaps between two active frames that last at most `max_silence_*`
       seconds are filled with ones.
    2. Runs enclosed by inactive frames that last at most `min_*` seconds are
       set to zero. The last frame always closes a run.

    One frame is assumed to last 220 / 22050 seconds.

    Args:
        output: 2-D array indexed as [class, frame] containing 0/1 values.
        min_speech: Threshold in seconds used when removing short speech runs.
        min_music: Threshold in seconds used when removing short music runs.
        max_silence_speech: Longest speech gap in seconds that is filled.
        max_silence_music: Longest music gap in seconds that is filled.

    Returns:
        The same `output` object, modified in place.
    """
    duration_frame = 220 / 22050
    n_frame = output.shape[1]
    final_frame = n_frame - 1

    def fill_short_gaps(row, max_silence):
        # Index of the most recent active frame; -1000 means "none seen yet".
        previous_active = -1000
        for i in range(n_frame):
            if output[row, i] != 1:
                continue
            gap = i - previous_active
            if gap > 1 and gap * duration_frame <= max_silence:
                output[row, previous_active:i] = 1
            previous_active = i

    def drop_short_runs(row, min_duration):
        # Index of the most recent inactive frame; -1000 means "none seen yet".
        previous_inactive = -1000
        for i in range(n_frame):
            at_end = (i == final_frame)
            if not at_end and output[row, i] != 0:
                continue
            span = i - previous_inactive
            too_short = span > 1 and span * duration_frame <= min_duration
            if at_end:
                # The final frame closes any run regardless of its own value.
                if too_short:
                    output[row, previous_inactive:i + 1] = 0
            else:
                if too_short:
                    output[row, previous_inactive:i] = 0
                previous_inactive = i

    # The two rows never influence each other, so each pass can be run per row.
    fill_short_gaps(0, max_silence_speech)
    fill_short_gaps(1, max_silence_music)

    drop_short_runs(0, min_speech)
    drop_short_runs(1, min_music)

    return output

"""
This function converts the predictions made by the neural network into a format that is understood by sed_eval.
"""

def preds_to_se(p, audio_clip_length = 8.0):
  """
  Convert frame-wise binary predictions into a list of sound events.

  Args:
    p: 2-D array indexed as [frame, class]; column 0 is speech and column 1
      is music, with 1 marking an active frame.
    audio_clip_length: End time in seconds given to events that are still
      active at the last frame.

  Returns:
    A list of (start_time, stop_time, label) tuples sorted by start time,
    where label is "speech" or "music" and times are in seconds.
  """
  not_started = -100
  classes = ((0, "speech"), (1, "music"))

  # Frame index at which the currently open event of each class began.
  open_start = {"speech": not_started, "music": not_started}

  audio_events = []
  n_frames = p.shape[0]

  # An event may already be active on the very first frame.
  for column, label in classes:
    if p[0, column] == 1:
      open_start[label] = 0

  # Walk over consecutive frame pairs and react to 0->1 and 1->0 transitions.
  for i in range(n_frames - 1):
    for column, label in classes:
      current = p[i, column]
      following = p[i + 1, column]

      if current == 0 and following == 1:
        open_start[label] = i + 1
      elif current == 1 and following == 0:
        start_time = frames_to_time(open_start[label])
        stop_time = frames_to_time(i)
        audio_events.append((start_time, stop_time, label))
        open_start[label] = not_started

  # Events still open at the end of the clip run until `audio_clip_length`.
  for column, label in classes:
    if open_start[label] != not_started:
      start_time = frames_to_time(open_start[label])
      audio_events.append((start_time, audio_clip_length, label))
      open_start[label] = not_started

  audio_events.sort(key = lambda event: event[0])
  return audio_events

def frames_to_time(f, sr = 22050.0, hop_size = 220):
  """
  Convert a frame index into a time in seconds.

  Args:
    f: Frame index (or number of frames).
    sr: Sampling rate in Hz.
    hop_size: Number of samples between consecutive frames.

  Returns:
    The time in seconds at which frame `f` starts.
  """
  sample_offset = f * hop_size
  return sample_offset / sr

def get_log_melspectrogram(audio, sr = 22050, hop_length = 220, n_fft = 1024, n_mels = 80, fmin = 64, fmax = 8000):
    """
    Return the log-scaled Mel bands of an audio signal.

    Args:
        audio: Audio signal passed to librosa as `y`.
        sr: Sampling rate of `audio` in Hz.
        hop_length: Hop length of the STFT in samples.
        n_fft: FFT size in samples.
        n_mels: Number of mel bands.
        fmin: Lowest frequency of the mel filter bank in Hz.
        fmax: Highest frequency of the mel filter bank in Hz.

    Returns:
        The mel power spectrogram converted to decibels by
        `librosa.core.power_to_db` with `amin=1e-7`.
    """
    mel_settings = dict(
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        dtype=np.float32,
    )
    mel_power = librosa.feature.melspectrogram(y=audio, **mel_settings)
    return librosa.core.power_to_db(mel_power, amin=1e-7)

def mk_preds_ens(audio_path, hop_size = 6.0, discard = 1.0, win_length = 8.0, sampling_rate = 22050):
  """
  Predict frame-wise speech/music activity for one audio file with the model ensemble.

  The file is converted to mono, resampled to `sampling_rate` and cut into
  overlapping windows of 802 mel frames that advance by 602 frames. Every
  model in the module-level `models` list predicts all windows; the
  predictions are averaged over the models and thresholded at 0.5. The windows
  are then stitched together, dropping 100 frames at every inner window edge
  so that the overlapping regions are not duplicated.

  Args:
    audio_path: Path of the audio file to analyse.
    hop_size: Unused; the hop is fixed to 220 * 602 - 1 samples. Kept for
      backward compatibility.
    discard: Unused; 100 frames are discarded at inner window edges. Kept for
      backward compatibility.
    win_length: Unused; the window is fixed to 220 * 802 - 1 samples. Kept for
      backward compatibility.
    sampling_rate: Sampling rate in Hz that the audio is resampled to. The mel
      spectrograms are computed with the defaults of `get_log_melspectrogram`.

  Returns:
    A float array of shape (n_frames, 2) containing 0.0/1.0 decisions. The
    columns follow the model's output order (the rest of this file treats
    column 0 as speech and column 1 as music).
  """
  in_signal, in_sr = sf.read(audio_path)

  # Convert to mono if needed.
  if in_signal.ndim > 1:
    in_signal = librosa.to_mono(in_signal.T)

  # Resample the audio file.
  in_signal = librosa.resample(in_signal, orig_sr=in_sr, target_sr=sampling_rate)

  # Pad the input signal if it is shorter than 8 s.
  min_samples = int(8.0 * sampling_rate)
  if in_signal.shape[0] < min_samples:
    pad_signal = np.zeros((min_samples))
    pad_signal[:in_signal.shape[0]] = in_signal
    in_signal = pad_signal

  audio_clip_length_samples = in_signal.shape[0]

  # Window geometry in samples: a window yields 802 mel frames and consecutive
  # windows start 602 frames apart (hop length of the spectrogram is 220).
  hop_size_samples = 220 * 602 - 1
  win_length_samples = 220 * 802 - 1

  n_preds = int(math.ceil((audio_clip_length_samples - win_length_samples) / hop_size_samples)) + 1

  # Zero-pad so that the last window is complete.
  in_signal_pad = np.zeros((n_preds * hop_size_samples + 200 * 220))
  in_signal_pad[0:audio_clip_length_samples] = in_signal

  mss_in = np.zeros((n_preds, 802, 80))

  for i in range(n_preds):
    start = i * hop_size_samples
    seg = in_signal_pad[start:start + win_length_samples]
    seg = librosa.util.normalize(seg)

    # Transpose to (frames, mel bands), the layout fed to the models.
    mss_in[i, :, :] = get_log_melspectrogram(seg).T

  # Average the predictions of all ensemble members.
  yhats = np.array([model.predict(mss_in) for model in models])
  summed = np.mean(yhats, axis=0)

  # `np.float` was only an alias of the builtin `float` and no longer exists
  # in current NumPy releases; the builtin gives the same float64 result.
  preds = (summed >= (0.5, 0.5)).astype(float)

  # Inner windows contribute only their central 602 frames.
  preds_mid = np.copy(preds[1:-1, 100:702, :])
  preds_mid_2 = preds_mid.reshape(-1, 2)

  if preds.shape[0] > 1:
    # First window: keep the start, drop the last 100 frames.
    oa_preds = preds[0, 0:702, :] # oa stands for overall predictions
  else:
    # A single window is used in full.
    oa_preds = preds[0, 0:802, :]

  oa_preds = np.concatenate((oa_preds, preds_mid_2), axis = 0)

  if preds.shape[0] > 1:
    # Last window: drop the first 100 frames, keep the end.
    oa_preds = np.concatenate((oa_preds, preds[-1, 100:, :]), axis = 0)

  return oa_preds

# Custom Metrics

In [None]:
"""
Custom Metrics
"""
class SpeechF1(tf.keras.metrics.Metric):
  """
  Streaming binary F1 score for channel 0 (speech) of frame-wise predictions.

  Targets and predictions are indexed as [batch, frame, class]; both are
  binarised at 0.5 and the confusion counts are accumulated over batches.
  """

  def __init__(self, name='speech_f1', **kwargs):
    super(SpeechF1, self).__init__(name=name, **kwargs)
    # Running confusion-matrix counts.
    self.tp = self.add_weight(name='true_positive', initializer='zeros')
    self.fp = self.add_weight(name='false_positive', initializer='zeros')
    self.tn = self.add_weight(name='true_negative', initializer='zeros')
    self.fn = self.add_weight(name='false_negative', initializer='zeros')

  def update_state(self, y_true, y_pred, sample_weight=None):
    """
    Add the confusion counts of one batch.

    Args:
      y_true: Target tensor indexed as [batch, frame, class].
      y_pred: Prediction tensor with the same layout.
      sample_weight: Accepted for API compatibility; it is ignored.
    """
    threshold = tf.constant([0.5])

    # Cast so that the comparison with the float32 threshold works for any
    # numeric input dtype.
    binary_true = tf.greater_equal(tf.cast(y_true[:, :, 0], tf.float32), threshold)
    binary_pred = tf.greater_equal(tf.cast(y_pred[:, :, 0], tf.float32), threshold)

    not_true = tf.logical_not(binary_true)
    not_pred = tf.logical_not(binary_pred)

    tp = tf.cast(tf.logical_and(binary_true, binary_pred), dtype = np.float32)
    fp = tf.cast(tf.logical_and(not_true, binary_pred), dtype = np.float32)
    tn = tf.cast(tf.logical_and(not_true, not_pred), dtype = np.float32)
    fn = tf.cast(tf.logical_and(binary_true, not_pred), dtype = np.float32)

    self.tp.assign_add(tf.reduce_sum(tp, axis = None))
    self.fp.assign_add(tf.reduce_sum(fp, axis = None))
    self.tn.assign_add(tf.reduce_sum(tn, axis = None))
    self.fn.assign_add(tf.reduce_sum(fn, axis = None))

  def result(self):
    """Return F1 = tp / (tp + 0.5 * (fp + fn)), or 0 when the denominator is 0."""
    # divide_no_nan avoids a NaN metric when no positive frame was seen or
    # predicted yet.
    binary_f1 = tf.math.divide_no_nan(self.tp, self.tp + 0.5 * (self.fp + self.fn))
    return binary_f1

  def reset_state(self):
    """Reset all accumulated counts to zero."""
    self.tp.assign(0)
    self.fp.assign(0)
    self.tn.assign(0)
    self.fn.assign(0)

  def reset_states(self):
    """Legacy name of `reset_state`, kept for older Keras versions."""
    self.reset_state()

class MusicF1(tf.keras.metrics.Metric):
  """
  Streaming binary F1 score for channel 1 (music) of frame-wise predictions.

  Targets and predictions are indexed as [batch, frame, class]; both are
  binarised at 0.5 and the confusion counts are accumulated over batches.
  """

  def __init__(self, name='music_f1', **kwargs):
    super(MusicF1, self).__init__(name=name, **kwargs)
    # Running confusion-matrix counts.
    self.tp = self.add_weight(name='true_positive', initializer='zeros')
    self.fp = self.add_weight(name='false_positive', initializer='zeros')
    self.tn = self.add_weight(name='true_negative', initializer='zeros')
    self.fn = self.add_weight(name='false_negative', initializer='zeros')

  def update_state(self, y_true, y_pred, sample_weight=None):
    """
    Add the confusion counts of one batch.

    Args:
      y_true: Target tensor indexed as [batch, frame, class].
      y_pred: Prediction tensor with the same layout.
      sample_weight: Accepted for API compatibility; it is ignored.
    """
    threshold = tf.constant([0.5])

    # Cast so that the comparison with the float32 threshold works for any
    # numeric input dtype.
    binary_true = tf.greater_equal(tf.cast(y_true[:, :, 1], tf.float32), threshold)
    binary_pred = tf.greater_equal(tf.cast(y_pred[:, :, 1], tf.float32), threshold)

    not_true = tf.logical_not(binary_true)
    not_pred = tf.logical_not(binary_pred)

    tp = tf.cast(tf.logical_and(binary_true, binary_pred), dtype = np.float32)
    fp = tf.cast(tf.logical_and(not_true, binary_pred), dtype = np.float32)
    tn = tf.cast(tf.logical_and(not_true, not_pred), dtype = np.float32)
    fn = tf.cast(tf.logical_and(binary_true, not_pred), dtype = np.float32)

    self.tp.assign_add(tf.reduce_sum(tp, axis = None))
    self.fp.assign_add(tf.reduce_sum(fp, axis = None))
    self.tn.assign_add(tf.reduce_sum(tn, axis = None))
    self.fn.assign_add(tf.reduce_sum(fn, axis = None))

  def result(self):
    """Return F1 = tp / (tp + 0.5 * (fp + fn)), or 0 when the denominator is 0."""
    # divide_no_nan avoids a NaN metric when no positive frame was seen or
    # predicted yet.
    binary_f1 = tf.math.divide_no_nan(self.tp, self.tp + 0.5 * (self.fp + self.fn))
    return binary_f1

  def reset_state(self):
    """Reset all accumulated counts to zero."""
    self.tp.assign(0)
    self.fp.assign(0)
    self.tn.assign(0)
    self.fn.assign(0)

  def reset_states(self):
    """Legacy name of `reset_state`, kept for older Keras versions."""
    self.reset_state()

# Make predictions

In [None]:
"""
'test_audio_dir' is the directory of audio files
"""
# Directory holding the audio files to run predictions on.
test_audio_dir = "/content/train-synth-audio-seg/Synthetic Radio Examples"

# Collect every file in that directory whose name ends in ".wav".
test_audio = glob.glob("{}/*.wav".format(test_audio_dir))

In [None]:
"""
This code block performs ensemble prediction for the audio files.
"""

# Random seeds of the training runs; one pre-trained model is loaded per seed.
expt_seeds = [13, 29, 77, 8, 136]

# There are 4 possible train_set_types: "ARE", "ARE-RRE", "SSE-RRE", "SSE"
#
# ARE: Artificial Radio Examples
# ARE-RRE: Artificial Radio Examples + Real-world Radio Examples
# SSE-RRE: Sound Segment Examples + Real-world Radio Examples
# SSE: Sound Segment Examples
#
# ARE-RRE has best performance.
#
# ARE-RRE > ARE > SSE-RRE > SSE
#
# Please refer to the research paper for more details.

train_set_type = "ARE-RRE"

# Ensemble members; `mk_preds_ens` reads this module-level list.
models = []

for expt_seed in expt_seeds:

  model_path = "/content/train-synth-audio-seg/models/Seed{}/{}/model-best.h5".format(expt_seed, train_set_type)
  m = tf.keras.models.load_model(model_path,
                                 custom_objects={'SpeechF1':SpeechF1(), 'MusicF1':MusicF1()})
  models.append(m)


print(models)


for tt in test_audio:
  # Take the clip duration from the file header. Dividing the frame count by
  # the file's own sample rate gives the right duration for any sample rate
  # (not only 22050 Hz) and avoids decoding the whole file a second time.
  info = sf.info(tt)
  clip_length = info.frames / float(info.samplerate)

  oop = mk_preds_ens(tt)

  # smooth_output works on [class, frame]; preds_to_se expects [frame, class].
  p_smooth = smooth_output(oop.T, min_speech = 0.8, min_music = 3.4, max_silence_speech = 0.8, max_silence_music = 0.8)
  p_smooth = p_smooth.T
  see = preds_to_se(p_smooth, audio_clip_length=clip_length)

  # One "start,stop,label" line per predicted event, written next to the audio file.
  n_label = tt.replace(".wav", "-se-prediction.txt")

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))


# Code to evaluate predictions

In [None]:
"""
Code for the second round of using sed_eval starts here. This simply tests on one of the muspeak datasets.
"""

# Directory in which the evaluation reports are stored.
eval_path = os.path.join(test_audio_dir, "Eval")

try:
  os.makedirs(eval_path, exist_ok = True)
  print("Directory '%s' created successfully" % eval_path)
except OSError as error:
  # With exist_ok=True an already existing directory is not an error, so this
  # branch means the directory really could not be created.
  print("Directory '%s' could not be created: %s" % (eval_path, error))


# Pair every audio file with its reference annotation and its prediction file.
file_list = [
    {
    'reference_file': tt.replace(".wav", "-label.txt"),
    'estimated_file': tt.replace(".wav", "-se-prediction.txt")
    }
    for tt in test_audio
]

data = []

# Get used event labels
all_data = dcase_util.containers.MetaDataContainer()
for file_pair in file_list:
    reference_event_list = sed_eval.io.load_event_list(
        filename=file_pair['reference_file']
    )
    estimated_event_list = sed_eval.io.load_event_list(
        filename=file_pair['estimated_file']
    )

    data.append({'reference_event_list': reference_event_list,
                'estimated_event_list': estimated_event_list})

    all_data += reference_event_list

event_labels = all_data.unique_event_labels

# Start evaluating

# Create metrics classes, define parameters
segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=event_labels,
    time_resolution=0.010
)

event_based_metrics = sed_eval.sound_event.EventBasedMetrics(
    event_label_list=event_labels,
    t_collar=0.500
)

# Go through files
for file_pair in data:
    segment_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

    event_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

# Get only certain metrics
overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
print("Accuracy:", overall_segment_based_metrics['accuracy']['accuracy'])

# Or print all metrics as reports

# Each report is stored twice: as a text summary and as a pickle of the
# metrics object. All paths are built the same way.
model_basename = train_set_type
seg_eval_basename = "seg eval " + model_basename.replace(".h5", "") + ".txt"
ev_eval_basename = "event eval " + model_basename.replace(".h5", "") + ".txt"
seg_eval_pickle = "seg eval " + model_basename.replace(".h5", "") + ".pickle"
ev_eval_pickle = "event eval " + model_basename.replace(".h5", "") + ".pickle"

with open(os.path.join(eval_path, seg_eval_basename), mode='w') as fp:
  fp.write(str(segment_based_metrics))

with open(os.path.join(eval_path, seg_eval_pickle), 'wb') as f:
  pickle.dump(segment_based_metrics, f, pickle.HIGHEST_PROTOCOL)

with open(os.path.join(eval_path, ev_eval_basename), mode = 'w') as fp:
  fp.write(str(event_based_metrics))

with open(os.path.join(eval_path, ev_eval_pickle), 'wb') as f:
  pickle.dump(event_based_metrics, f, pickle.HIGHEST_PROTOCOL)


In [None]:
# Display the list of audio file paths stored in `test_audio`.
test_audio