<a href="https://colab.research.google.com/github/satvik-venkatesh/you-only-hear-once/blob/main/YOHO-TUT-Sound-Events-2017.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from zipfile import ZipFile
import glob

# Download development dataset from [zenodo](https://zenodo.org/record/400516#.YTjxSJ1KhPY)

In [None]:
!mkdir "/content/DevelopmentZipped/"

In [None]:
!wget https://zenodo.org/record/814831/files/TUT-sound-events-2017-development.audio.1.zip?download=1 -O /content/DevelopmentZipped/TUT-sound-events-2017-development.audio.1.zip

In [None]:
!wget https://zenodo.org/record/814831/files/TUT-sound-events-2017-development.audio.2.zip?download=1 -O /content/DevelopmentZipped/TUT-sound-events-2017-development.audio.2.zip

In [None]:
!wget https://zenodo.org/record/814831/files/TUT-sound-events-2017-development.doc.zip?download=1 -O /content/DevelopmentZipped/TUT-sound-events-2017-development.doc.zip

In [None]:
!wget https://zenodo.org/record/814831/files/TUT-sound-events-2017-development.meta.zip?download=1 -O /content/DevelopmentZipped/TUT-sound-events-2017-development.meta.zip

In [None]:
# Extract every downloaded development archive into /content/development.
g = glob.glob("/content/DevelopmentZipped/*.zip")

for zip_path in g:
  # Bind the archive to `archive`, not `zip`: a module-level `zip` would hide
  # the built-in zip() from every later cell.
  with ZipFile(zip_path, 'r') as archive:
    archive.extractall('/content/development')

# Download [evaluation dataset](https://zenodo.org/record/1040179#.YTj1dJ1KhPY) from zenodo

In [None]:
!mkdir "/content/EvaluationZipped/"

In [None]:
!wget https://zenodo.org/record/1040179/files/TUT-sound-events-2017-evaluation.audio.zip?download=1 -O /content/EvaluationZipped/TUT-sound-events-2017-evaluation.audio.zip

In [None]:
!wget https://zenodo.org/record/1040179/files/TUT-sound-events-2017-evaluation.doc.zip?download=1 -O /content/EvaluationZipped/TUT-sound-events-2017-evaluation.doc.zip

In [None]:
!wget https://zenodo.org/record/1040179/files/TUT-sound-events-2017-evaluation.meta.zip?download=1 -O /content/EvaluationZipped/TUT-sound-events-2017-evaluation.meta.zip

In [None]:
# Extract every downloaded evaluation archive into /content/evaluation.
g = glob.glob("/content/EvaluationZipped/*.zip")

for zip_path in g:
  # Bind the archive to `archive`, not `zip`: a module-level `zip` would hide
  # the built-in zip() from every later cell.
  with ZipFile(zip_path, 'r') as archive:
    archive.extractall('/content/evaluation')

# Annotations

In [None]:
import csv

In [None]:
def read_annotation(filename):
    """Read a tab-separated annotation file into a list of rows.

    Args:
        filename: Path of the tab-delimited text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        string fields found on that line.
    """
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation recommends for file objects handed to csv.reader.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        # Blank lines are returned by csv.reader as empty lists; drop them so
        # callers can index the fields of every returned row.
        return [row for row in reader if row]

In [None]:
events = read_annotation("/content/development/TUT-sound-events-2017-development/meta/street/a001.ann")

In [None]:
audio_files = glob.glob("/content/development/TUT-sound-events-2017-development/audio/street/*.wav")

In [None]:
len(audio_files)

In [None]:
import soundfile as sf

In [None]:
!sudo apt-get install sox

In [None]:
from subprocess import Popen, PIPE
from os.path import dirname
import os

In [None]:
# Create the output folder for the mono copies; exist_ok=True keeps the cell
# re-runnable once the folder exists.
os.makedirs(dirname(audio_files[0]).replace("audio", "audio-mono"), exist_ok=True)

In [None]:
len(audio_files)

In [None]:
# Down-mix every development recording to a single channel with sox, writing
# the result to the parallel "audio-mono" tree.
for sound in audio_files:
  temp_file = sound.replace("audio", "audio-mono")
  # Pass the arguments as a list (no shell) so paths containing spaces or shell
  # metacharacters reach sox unchanged.
  command = ["sox", sound, temp_file, "channels", "1"]
  p = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  output, err = p.communicate()
  # Report failures instead of silently producing an incomplete mono set.
  if p.returncode != 0:
    print("sox failed for " + sound + ": " + err.decode(errors="replace"))

In [None]:
audio_files_mono = glob.glob("/content/development/TUT-sound-events-2017-development/audio-mono/street/*.wav")

In [None]:
len(audio_files_mono)

# Split into folds

In [None]:
fold1_train_events = read_annotation("/content/development/TUT-sound-events-2017-development/evaluation_setup/street_fold1_train.txt")
fold1_val_events = read_annotation("/content/development/TUT-sound-events-2017-development/evaluation_setup/street_fold1_evaluate.txt")

In [None]:
# Unique mono audio paths of the fold, taken from the first column of the
# evaluation-setup rows. They are sorted because the iteration order of a set
# of strings can differ between Python sessions, which would renumber the saved
# examples from run to run.
fold1_train_files = sorted({f[0].replace("audio", "/content/development/TUT-sound-events-2017-development/audio-mono") for f in fold1_train_events})
fold1_val_files = sorted({f[0].replace("audio", "/content/development/TUT-sound-events-2017-development/audio-mono") for f in fold1_val_events})

In [None]:
fold1_val_files

In [None]:
import math
import numpy as np

In [None]:
def construct_examples(audio_path, win_len = 2.56, hop_len = 1.0, sr = 44100.0):
  """Cut an audio file into fixed-length, zero-padded windows.

  Args:
    audio_path: Path of the audio file, read with ``sf.read``.
    win_len: Window length in seconds.
    hop_len: Hop between consecutive windows in seconds.
    sr: Sample rate used to convert ``win_len`` and ``hop_len`` to samples.

  Returns:
    A pair ``(a_ex, win_ranges)``: the list of windows (slices of the padded
    signal) and, for each window, its ``(start, end)`` time in seconds. The
    times are computed with the sample rate returned by ``sf.read``.
  """
  # Window and hop sizes in samples are derived from the `sr` argument.
  win_samples = int(sr * win_len)
  hop_samples = int(sr * hop_len)

  # assumes the file is mono (1-D array) — TODO confirm for other inputs
  signal, file_sr = sf.read(audio_path)
  n_samples = signal.shape[0]

  # Total padded length: one window for short files, otherwise enough whole
  # hops to cover the last sample.
  if n_samples < win_samples:
    padded_len = win_samples
  else:
    n_hops = math.ceil((n_samples - win_samples) / hop_samples)
    padded_len = int(win_samples + hop_samples * n_hops)

  padded = np.zeros((padded_len, ))
  padded[0:n_samples] = signal

  # Each value is the (exclusive) end sample of one window.
  window_ends = range(win_samples, padded.shape[0] + 1, hop_samples)
  a_ex = [padded[end - win_samples : end] for end in window_ends]
  win_ranges = [((end - win_samples) / file_sr, end / file_sr) for end in window_ends]

  return a_ex, win_ranges

In [None]:
def construct_labels(annotation_path, win_start, win_end, win_len):
  """Return the annotated events that overlap one analysis window.

  Args:
    annotation_path: Annotation file read with ``read_annotation``; columns 2,
      3 and 4 of each row are used as onset, offset and class label.
    win_start: Window start time in seconds.
    win_end: Window end time in seconds.
    win_len: Window length in seconds; clipped offsets never exceed it.

  Returns:
    A list of ``[start, end, label]`` events with times relative to
    ``win_start``, clipped to ``[0, win_len]``, rounded to 3 decimals and
    sorted by start. Touching or overlapping events of the same class are
    merged into one.
  """
  events = read_annotation(annotation_path)

  # Group the clipped events by class. An event must overlap the window
  # strictly: one that starts exactly at win_end has no extent inside it and
  # would otherwise yield a zero-length label at the window edge.
  class_wise_events = {}
  for e in events:
    onset, offset, label = float(e[2]), float(e[3]), e[4]
    if offset > win_start and onset < win_end:
      curr_start = max(onset - win_start, 0.0)
      curr_end = min(offset - win_start, win_len)
      class_wise_events.setdefault(label, []).append([curr_start, curr_end, label])

  max_event_silence = 0.0
  all_events = []

  for curr_events in class_wise_events.values():
    # The merge below only compares neighbours, so it needs start-sorted input.
    curr_events.sort(key=lambda x: x[0])
    merged = [curr_events[0]]
    for ev in curr_events[1:]:
      last = merged[-1]
      if (last[1] >= ev[0]) or (ev[0] - last[1] <= max_event_silence):
        last[1] = max(ev[1], last[1])
      else:
        merged.append(ev)
    all_events += merged

  for ev in all_events:
    ev[0] = round(ev[0], 3)
    ev[1] = round(ev[1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events

In [None]:
# Maps each sound-event class name to its integer index; the label-building
# functions use this index to select the class's column(s).
class_dict = {'brakes squeaking': 0,
              'car': 1,
              'children': 2,
              'large vehicle': 3,
              'people speaking': 4,
              'people walking': 5}

In [None]:
def to_seg_by_class(events, class_dict, hop_len = 441, n_frames = 257, sr=44100):
  """Build a frame-wise multi-hot label matrix from a list of events.

  Args:
    events: Iterable of ``[start, end, label]`` with times in seconds.
    class_dict: Maps a class label to its column index.
    hop_len: Hop between frames in samples.
    n_frames: Number of frames (rows) of the output.
    sr: Sample rate in Hz.

  Returns:
    A ``float32`` array of shape ``(n_frames, len(class_dict))`` holding 1 for
    every frame in ``[start_frame, end_frame)`` of each event's class column.
  """
  # One column per class; the width follows class_dict instead of a fixed 6.
  labels = np.zeros((n_frames, len(class_dict)), dtype=np.float32)

  for e in events:
    # Seconds -> frame index (truncated).
    t1 = int(float(e[0]) / hop_len * sr)
    t2 = int(float(e[1]) / hop_len * sr)

    labels[t1:t2, class_dict[e[2]]] = 1

  return labels

In [None]:
def get_universal_labels(events, class_dict, ex_length = 10.0, no_of_div = 32):
  """Encode events as per-bin ``(presence, start, end)`` triplets.

  The example of length ``ex_length`` is split into ``no_of_div`` equal bins.
  For every bin an event touches, the three columns of the event's class hold
  ``[1, start, end]``, where start and end are offsets inside the bin divided
  by the bin length.

  Args:
    events: Iterable of ``[start, end, label]`` with times in seconds relative
      to the start of the example.
    class_dict: Maps a class label to its class index.
    ex_length: Length of the example in seconds.
    no_of_div: Number of bins.

  Returns:
    An array of shape ``(no_of_div, 3 * len(class_dict))``.
  """
  win_length = ex_length/no_of_div
  labels = np.zeros((no_of_div, len(class_dict.keys()) * 3))
  last_bin = no_of_div - 1

  for e in events:

    start_time = float(e[0])
    stop_time = float(e[1])

    start_bin = int(start_time // win_length)
    stop_bin = int(stop_time // win_length)

    # An event starting at or after the end of the example has no bin.
    if start_bin > last_bin:
      continue

    start_time_2 = start_time - start_bin * win_length

    if stop_bin > last_bin:
      # The event reaches (or passes) the end of the example: it fills the last
      # bin completely. Without this clamp the row index would be no_of_div.
      stop_bin = last_bin
      stop_time_2 = win_length
    else:
      stop_time_2 = stop_time - stop_bin * win_length

    col = class_dict[e[2]] * 3
    n_bins = stop_bin - start_bin

    if n_bins == 0:
      labels[start_bin, col:col + 3] = [1, start_time_2, stop_time_2]

    elif n_bins > 0:
      # First bin: from the event start to the end of the bin.
      labels[start_bin, col:col + 3] = [1, start_time_2, win_length]
      # Bins strictly between the first and the last are fully covered.
      labels[start_bin + 1:stop_bin, col:col + 3] = [1, 0.0, win_length]
      # Last bin: only written when the event actually extends into it.
      if stop_time_2 > 0.0:
        labels[stop_bin, col:col + 3] = [1, 0.0, stop_time_2]

  # Express start/end offsets as fractions of a bin; presence columns
  # (every third column, starting at 0) are left untouched.
  labels[:, 1::3] /= win_length
  labels[:, 2::3] /= win_length

  return labels

In [None]:
import shutil

In [None]:
shutil.rmtree("/content/train-data", ignore_errors=True)
os.mkdir("/content/train-data")

In [None]:
"""
Construct train set
"""

# Window length and hop, in seconds, used to cut every training file.
win_len = 2.56
hop_len = 1.96
# Audio windows and their label arrays, kept in matching order.
a_ex_train = []
a_labels_train = []

for i, audio in enumerate(fold1_train_files):
  # Cut the file into windows; win_ranges holds each window's (start, end) time.
  a, win_ranges = construct_examples(audio,win_len=win_len, hop_len=hop_len)
  a_ex_train += a

  for w in win_ranges:
    # The annotation file sits in the parallel "meta" tree with an .ann suffix.
    labels_t = construct_labels(audio.replace(".wav", ".ann").replace("audio-mono", "meta"), w[0], w[1], win_len=win_len)
    # Encode the window's events over 9 bins.
    ll = get_universal_labels(labels_t, class_dict, ex_length=win_len, no_of_div = 9)
    # Alternative frame-wise encoding: to_seg_by_class(labels_t, class_dict)
    a_labels_train.append(ll)


In [None]:
import librosa

In [None]:
def get_log_melspectrogram(audio, sr = 44100, hop_length = 441, win_length = 1764, n_fft = 2048, n_mels = 40, fmin = 0, fmax = 22050):
    """Return the log-scaled Mel bands of an audio signal.

    Args:
        audio: Audio samples; normalised with ``librosa.util.normalize`` first.
        sr: Sample rate in Hz.
        hop_length: Hop between frames in samples.
        win_length: Analysis window length in samples.
        n_fft: FFT size.
        n_mels: Number of Mel bands.
        fmin: Lowest frequency of the Mel filterbank in Hz.
        fmax: Highest frequency of the Mel filterbank in Hz, limited to
            ``sr / 2``; ``None`` leaves the choice to librosa.

    Returns:
        The Mel power spectrogram converted to decibels.
    """
    # fmin/fmax used to be accepted but ignored. Capping fmax at sr / 2 keeps
    # the result unchanged for callers that only pass a different sr.
    if fmax is not None:
        fmax = min(fmax, sr / 2)
    audio_2 = librosa.util.normalize(audio)
    bands = librosa.feature.melspectrogram(
        y=audio_2, sr=sr, hop_length=hop_length, win_length = win_length, n_fft=n_fft, n_mels=n_mels,
        fmin=fmin, fmax=fmax)
    return librosa.core.power_to_db(bands)

In [None]:
# a, sr = sf.read(audio_files_mono[0])
M = get_log_melspectrogram(a_ex_train[0])

In [None]:
M.shape

In [None]:
# Save one transposed log-mel spectrogram per training window as ex-<i>.npy.
for i, a in enumerate(a_ex_train):
  M = get_log_melspectrogram(a).T
  np.save("/content/train-data/ex-" + str(i) + ".npy", M)

In [None]:
# Save the label array of each training window as label-<i>.npy; the index
# matches the ex-<i>.npy file built from the same window.
for i, a in enumerate(a_labels_train):
  np.save("/content/train-data/label-" + str(i) + ".npy", a)

In [None]:
# !rm -rf "/content/val-data"

In [None]:
shutil.rmtree("/content/val-data", ignore_errors=True)
os.mkdir("/content/val-data")

In [None]:
"""
Construct val set
"""

# Window length and hop, in seconds, used to cut every validation file.
win_len = 2.56
hop_len = 1.96
# Audio windows and their label arrays, kept in matching order.
a_ex_val = []
a_labels_val = []

for i, audio in enumerate(fold1_val_files):
  # Cut the file into windows; win_ranges holds each window's (start, end) time.
  a, win_ranges = construct_examples(audio,win_len=win_len, hop_len=hop_len)
  a_ex_val += a

  for w in win_ranges:
    # The annotation file sits in the parallel "meta" tree with an .ann suffix.
    labels_t = construct_labels(audio.replace(".wav", ".ann").replace("audio-mono", "meta"), w[0], w[1], win_len=win_len)
    # Encode the window's events over 9 bins.
    ll = get_universal_labels(labels_t, class_dict, ex_length=win_len, no_of_div = 9)
    # Alternative frame-wise encoding: to_seg_by_class(labels_t, class_dict)
    a_labels_val.append(ll)


In [None]:
win_ranges

In [None]:
# Save one transposed log-mel spectrogram per validation window as ex-<i>.npy.
for i, a in enumerate(a_ex_val):
  M = get_log_melspectrogram(a).T
  np.save("/content/val-data/ex-" + str(i) + ".npy", M)

In [None]:
# Save the label array of each validation window as label-<i>.npy; the index
# matches the ex-<i>.npy file built from the same window.
for i, a in enumerate(a_labels_val):
  np.save("/content/val-data/label-" + str(i) + ".npy", a)

In [None]:
import re

def tryint(s):
    """Return ``int(s)`` when *s* parses as an integer, otherwise *s* itself."""
    try:
        value = int(s)
    except ValueError:
        # Not an integer literal: hand the input back untouched.
        value = s
    return value
    
def alphanum_key(s):
    """Split a string into alternating text and integer chunks.

    Example: "z23a" -> ["z", 23, "a"]
    """
    # The capturing group keeps the digit runs in the split result.
    chunks = re.split('([0-9]+)', s)
    return list(map(tryint, chunks))

def sort_nicely(l):
    """Sort the list in place in natural order, so "ex-2" precedes "ex-10"."""
    # Slice assignment keeps the caller's list object, i.e. an in-place sort.
    l[:] = sorted(l, key=alphanum_key)

In [None]:
import glob
import random
"""
Load the individual numpy arrays into partition
"""
# Spectrogram files of the training set, in natural (numeric) order.
data = glob.glob("/content/train-data/ex-*.npy")
sort_nicely(data)

# Label files of the training set, in the same natural order, so that
# position i of both lists refers to the same window.
labels = glob.glob("/content/train-data/label-*.npy")
sort_nicely(labels)

# Pair each spectrogram path with its label path.
train_examples = [(data[i], labels[i]) for i in range(len(data))]

# Shuffle with a fixed seed.
random.seed(4)
random.shuffle(train_examples)
#print(train_examples[0])

# m = len(train_examples)
# m_validation = 1024
# m_test = 512
# m_train = 40960

# partition = {}
# partition['train'] = train_examples[0:m_train]
# partition['validation'] = examples[m_train:m_train + m_validation]
# partition['test'] = examples[m_train + m_validation:m]

In [None]:
"""
Creating the train partition.
"""
# partition maps a split name to its list of (spectrogram_path, label_path) pairs.
partition = {}
partition['train'] = train_examples

# Shuffles the training list in place a second time.
random.shuffle(partition['train'])

In [None]:
"""
This loads data for the validation set.
"""
import glob
import random

# Spectrogram files of the validation set, in natural (numeric) order.
data = glob.glob("/content/val-data/ex-*.npy")
sort_nicely(data)

# Label files of the validation set, in the same natural order.
labels = glob.glob("/content/val-data/label-*.npy")
sort_nicely(labels)

# Pair each spectrogram path with its label path.
validation_examples = [(data[i], labels[i]) for i in range(len(data))]

# Shuffle with a fixed seed and show the first pair as a sanity check.
random.seed(4)
random.shuffle(validation_examples)
print(validation_examples[0])

partition['validation'] = validation_examples

In [None]:
!git clone https://github.com/DemisEom/SpecAugment.git
!pip install /content/SpecAugment/
!pip install tensorflow-addons

In [None]:
from SpecAugment import spec_augment_tensorflow

In [None]:
import tensorflow as tf
import keras

class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of saved spectrogram/label pairs.

    Every example is a ``(spectrogram_path, label_path)`` pair of ``.npy``
    files. A batch is loaded into arrays of shape ``(batch_size, 257, 40, 1)``
    and ``(batch_size, 9, 18)``. When ``augment`` is true, SpecAugment frequency
    and time masking is applied to the spectrograms of the batch.
    """

    def __init__(self, list_examples, batch_size=128, epoch_size = 16384, dim=(1, ),
                 n_classes=2, shuffle=True, augment=True):
        """Store the configuration and build the first index order.

        Args:
            list_examples: List of ``(spectrogram_path, label_path)`` pairs.
            batch_size: Number of examples per batch.
            epoch_size: Stored but not used by this class.
            dim: Stored but not used by this class.
            n_classes: Stored but not used by this class.
            shuffle: Reshuffle the example order at the end of every epoch.
            augment: Apply SpecAugment masking to every batch. Pass ``False``
                for data that must stay unaltered, e.g. a validation set.
        """
        super().__init__()
        print("Constructor called!!!")
        self.dim = dim
        self.batch_size = batch_size
        self.epoch_size = epoch_size
        self.list_examples = list_examples
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.augment = augment
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a remainder is dropped)."""
        return len(self.list_examples) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Resolve the positions to their (spectrogram_path, label_path) pairs.
        list_IDs_temp = [self.list_examples[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """Rebuild the example order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_examples))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the files of one batch and optionally mask the spectrograms."""
        X = np.empty([self.batch_size, 257, 40, 1], dtype=np.float64)
        y = np.empty([self.batch_size, 9, 18], dtype=np.float64)

        for i, ID in enumerate(list_IDs_temp):
            # ID[0] is the spectrogram file, ID[1] the label file.
            X[i, :, :, 0] = np.load(ID[0])
            y[i, :, :] = np.load(ID[1])

        if self.augment:
            tau = X.shape[1]
            v = X.shape[2]

            # One frequency mask followed by two time masks.
            warped_frequency_spectrogram = spec_augment_tensorflow.frequency_masking(X, v=v,  frequency_masking_para=8, frequency_mask_num=1)
            X = spec_augment_tensorflow.time_masking(warped_frequency_spectrogram, tau=tau, time_masking_para=25, time_mask_num=2)

        return X, y

In [None]:
# Parameters
# Keyword arguments shared by both generators below.
params = {'dim': (1, ),
          'batch_size': 32,
          'epoch_size': 0,
          'n_classes': 2,
          'shuffle': True}

# Generators
# Both splits are served by the same DataGenerator class with the same params.
training_generator = DataGenerator(partition['train'], **params)
validation_generator = DataGenerator(partition['validation'], **params)

# Define the YOHO network

In [None]:
def my_loss_fn(y_true, y_pred):
  """Masked squared-error loss over ``(presence, start, end)`` triplets.

  The last axis of both tensors holds three values per class. The presence
  error always counts; the start and end errors of a class count only where
  the true presence of that class is non-zero (they are multiplied by it).

  Args:
    y_true: Target tensor of shape ``(batch, bins, 3 * n_classes)``.
    y_pred: Prediction tensor of the same shape.

  Returns:
    A tensor of shape ``(batch,)``: the masked squared error summed over the
    bin and feature axes.
  """
  squared_difference = tf.square(y_true - y_pred)

  # True presence of every class: columns 0, 3, 6, ...
  presence = y_true[:, :, 0::3]

  # Per-class weights [1, presence, presence], flattened back to the layout of
  # the last axis. This works for any number of classes.
  mask = tf.stack([tf.ones_like(presence), presence, presence], axis=-1)
  mask = tf.reshape(mask, tf.shape(squared_difference))

  squared_difference = tf.multiply(squared_difference, mask)

  # Sum over the feature axis (-1) and the bin axis (-2); one value per example.
  return tf.reduce_sum(squared_difference, axis=[-1, -2])

In [None]:
import tensorflow as tf

In [None]:
# This optimises val loss for Wave-U-Net YOHO
# Back to Val Binary acc.

import os
class MyCustomCallback_3(tf.keras.callbacks.Callback):
  """Early stopping on ``val_loss`` with weight checkpoints and resumable state.

  After every epoch the current weights are written to ``model-last-epoch.h5``.
  When ``val_loss`` improves on the best value so far, the weights are also
  written to ``model-best.h5``. The best loss and the number of finished epochs
  are pickled to ``custom_params.pickle`` and reloaded by the constructor when
  that file exists. Training stops after ``patience`` epochs without
  improvement.
  """

  def __init__(self, model_dir, patience=0):
    """Set up the paths under ``model_dir`` and reload saved state if present.

    Args:
      model_dir: Directory holding the weight files and the pickled state.
      patience: Number of non-improving epochs after which training stops.
    """
    # Imported locally so the class works even when the notebook cell that
    # imports pickle has not been run yet.
    import pickle

    super(MyCustomCallback_3, self).__init__()
    self.patience = patience
    # Weights of the best epoch seen during this run.
    self.best_weights = None
    self.model_best_path = os.path.join(model_dir, 'model-best.h5')
    self.model_last_path = os.path.join(model_dir, 'model-last-epoch.h5')
    self.custom_params = {"best_loss":np.inf, "last_epoch":0}

    self.custom_params_path = os.path.join(model_dir, 'custom_params.pickle')
    if os.path.isfile(self.custom_params_path):
      # NOTE: unpickling can execute code from the file; only use a model_dir
      # whose contents are trusted.
      with open(self.custom_params_path, 'rb') as f:
        self.custom_params = pickle.load(f)

    self.wait = 0
    self.stopped_epoch = 0
    self.is_impatient = False

  def _restore_best_weights(self):
    """Put the best known weights back on the model, if any are available."""
    if self.best_weights is not None:
      print("Restoring model weights from the end of the best epoch.")
      self.model.set_weights(self.best_weights)
    elif os.path.isfile(self.model_best_path):
      # No epoch of this run beat the reloaded best loss; fall back to the
      # best-weights file if one exists.
      print("Restoring model weights from the end of the best epoch.")
      self.model.load_weights(self.model_best_path)

  def on_train_begin(self, logs=None):
    # Number of consecutive epochs without improvement.
    self.wait = 0
    # The epoch the training stops at.
    self.stopped_epoch = 0
    # Set once patience has run out and the best weights were restored.
    self.is_impatient = False

  def on_train_end(self, logs=None):
    # When early stopping fired, the weights were already restored.
    if not self.is_impatient:
      self._restore_best_weights()

  def on_epoch_end(self, epoch, logs=None):
    import pickle

    current_val_loss = (logs or {}).get("val_loss")
    self.model.save_weights(self.model_last_path)
    self.custom_params["last_epoch"] = self.custom_params["last_epoch"] + 1

    if current_val_loss is None:
      # Without validation data there is nothing to compare; keep training.
      print("val_loss is not available; skipping the early-stopping check.")
    elif current_val_loss < self.custom_params['best_loss']:
      self.custom_params['best_loss'] = current_val_loss
      self.wait = 0
      self.best_weights = self.model.get_weights()
      self.model.save_weights(self.model_best_path)
    else:
      self.wait += 1
      if self.wait >= self.patience:
        self.stopped_epoch = epoch
        self.is_impatient = True
        self.model.stop_training = True
        self._restore_best_weights()

    # Persist the state after every epoch so that a later run can resume.
    with open(self.custom_params_path, 'wb') as f:
      pickle.dump(self.custom_params, f, pickle.HIGHEST_PROTOCOL)

In [None]:
list(enumerate(fold1_val_files))

In [None]:
len(fold1_val_files)

In [None]:
!pip install sed_eval

In [None]:
import sed_eval
import dcase_util

In [None]:
import pickle

In [None]:
# Creates the mel spectrograms of the validation files. mss_ins and
# win_ranges_list are indexed by the file's position in fold1_val_files and are
# read by mk_preds_YOHO_mel.

win_length = 2.56
hop_size = 1.96
mss_ins = []
win_ranges_list = []


for ii, audio in enumerate(fold1_val_files):
  a, win_ranges = construct_examples(audio, win_len=win_length,hop_len=hop_size)

  # One (257, 40) transposed log-mel spectrogram per window of the file.
  mss_in = np.zeros((len(a), 257, 40))

  # NOTE(review): preds is allocated here but not used in this cell.
  preds = np.zeros((len(a), 9, 18))


  for i in range(len(a)):
    M = get_log_melspectrogram(a[i])
    mss_in[i, :, :] = M.T
  mss_ins.append(mss_in)
  win_ranges_list.append(win_ranges)

def mk_preds_YOHO_mel(model, ind, mss_ins=mss_ins, no_of_div = 9, hop_size = 1.96, discard = 0.3, win_length = 2.56, max_event_silence = 0.3, sampling_rate = 44100):
  """Predict the sound events of one pre-computed file.

  Args:
    model: Object with a ``predict`` method returning, per window, an array
      with ``no_of_div`` rows of ``(presence, start, end)`` triplets per class.
    ind: Index of the file in ``mss_ins`` and ``win_ranges_list``.
    mss_ins: List of per-file spectrogram batches.
    no_of_div: Number of bins per window.
    hop_size: Unused; kept for interface compatibility.
    discard: Unused; kept for interface compatibility.
    win_length: Window length in seconds.
    max_event_silence: Same-class events separated by at most this many
      seconds are merged.
    sampling_rate: Unused; kept for interface compatibility.

  Returns:
    A list of ``[start, end, label]`` events in seconds from the start of the
    file, rounded to 3 decimals and sorted by start.
  """
  preds = model.predict(mss_ins[ind])
  win_width = win_length / no_of_div

  # Collect the detected events per class.
  class_wise_events = {}
  for i in range(len(preds)):
    p = preds[i, :, :]
    win_start = win_ranges_list[ind][i][0]
    for j in range(len(p)):
      for jjj in range(len(rev_class_dict)):
        if p[j][jjj*3] >= 0.5:
          # Bin offset + predicted offset inside the bin + start of the window.
          start = win_width * j + win_width * p[j][jjj*3+1] + win_start
          end = p[j][jjj*3+2] * win_width + start
          label = rev_class_dict[jjj]
          class_wise_events.setdefault(label, []).append([start, end, label])

  all_events = []

  for curr_events in class_wise_events.values():
    # Consecutive windows overlap, so the events are not in start order; the
    # neighbour-wise merge below needs them sorted.
    curr_events.sort(key=lambda x: x[0])
    merged = [curr_events[0]]
    for ev in curr_events[1:]:
      last = merged[-1]
      if (last[1] >= ev[0]) or (ev[0] - last[1] <= max_event_silence):
        last[1] = max(ev[1], last[1])
      else:
        merged.append(ev)
    all_events += merged

  for ev in all_events:
    ev[0] = round(ev[0], 3)
    ev[1] = round(ev[1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events


import os

In [None]:
# Class names by class index; used to turn a predicted class index back into
# its label.
rev_class_dict = ['brakes squeaking',
              'car',
              'children',
              'large vehicle',
              'people speaking',
              'people walking']

In [None]:
def frames_to_time(f, sr = 44100.0, hop_size = 441):
  """Convert a frame index to seconds, i.e. ``f * hop_size / sr``."""
  sample_offset = f * hop_size
  return sample_offset / sr

def preds_to_se(p, win_start, audio_clip_length = 2.56):
  """Turn frame-wise class activations into a list of events.

  Args:
    p: 2-D array indexed as ``p[frame, class]``; values ``>= 0.5`` count as
      active.
    win_start: Time in seconds added to every event boundary.
    audio_clip_length: End time, in seconds, given to events that are still
      active in the last frame.

  Returns:
    A list of ``[start, end, label]`` events sorted by start time.
  """
  audio_events = []

  n_frames = p.shape[0]
  if n_frames == 0:
    return audio_events

  # Boolean activity per frame and class.
  active = p >= 0.5

  for j in range(p.shape[1]):
    # Frame at which the currently open event started; None while inactive.
    start_frame = 0 if active[0, j] else None

    for i in range(n_frames - 1):
      if not active[i, j] and active[i + 1, j]:
        start_frame = i + 1

      elif active[i, j] and not active[i + 1, j]:
        # The event ends at frame i, the last active one.
        start_time = frames_to_time(start_frame)
        stop_time = frames_to_time(i)
        audio_events.append([start_time+win_start, stop_time+win_start, rev_class_dict[j]])
        start_frame = None

    if start_frame is not None:
      # Still active in the last frame: close the event at the end of the clip.
      start_time = frames_to_time(start_frame)
      audio_events.append([start_time+win_start, audio_clip_length+win_start, rev_class_dict[j]])

  audio_events.sort(key = lambda x: x[0])
  return audio_events

In [None]:
def extract_labels_2(annotation_path):
  """Write an annotation file as comma-separated reference events.

  Columns 2, 3 and 4 of every annotation row (onset, offset, label) become one
  ``onset,offset,label`` line, with times rounded to 5 decimals. The output
  goes to "/content/eval-files-2/" under the annotation file's base name.
  """
  lines = []
  for row in read_annotation(annotation_path):
    onset, offset, label = float(row[2]), float(row[3]), row[4]
    lines.append('{},{},{}'.format(round(onset, 5), round(offset, 5), label))

  n_label = "/content/eval-files-2/" + os.path.basename(annotation_path)

  with open(n_label, 'w') as fp:
    fp.write('\n'.join(lines))

In [None]:
shutil.rmtree('/content/eval-files-2/', ignore_errors=True)
os.mkdir("/content/eval-files-2/")

In [None]:
for audio in fold1_val_files:
  extract_labels_2(audio.replace(".wav", ".ann").replace("audio-mono", "meta"))

In [None]:
class MyCustomCallback_44(tf.keras.callbacks.Callback):
  """Evaluate segment-based F-measure and error rate after each epoch.

  From the third epoch on (``epoch > 1``), the model's events are predicted for
  every file of ``fold1_val_files`` and written to "/content/eval-files-2/",
  then scored with sed_eval against the reference files found there. The
  weights are saved whenever the F-measure or the error rate improves.
  """

  def __init__(self):
    super(MyCustomCallback_44, self).__init__()
    # Best segment-based scores seen so far.
    self.best_f1 = 0.0
    self.best_error = np.inf

  def on_epoch_end(self, epoch, logs=None):
    # Epochs 0 and 1 are not evaluated.
    if epoch <= 1:
      return

    # Write one prediction file per validation recording.
    for ii, audio in enumerate(fold1_val_files):
      see = mk_preds_YOHO_mel(self.model, ii)
      n_label = "/content/eval-files-2/" + os.path.basename(audio).replace(".wav" ,"") + "-se-prediction.ann"

      with open(n_label, 'w') as fp:
        fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))

    destination = "/content/eval-files-2/"
    # Reference files end in a digit; this leaves out the "-se-prediction" files.
    test_set = glob.glob(destination + "*[0-9].ann")

    file_list = [
        {
        'reference_file': tt,
        'estimated_file': tt.replace(".ann","-se-prediction.ann")
        }
        for tt in test_set
    ]

    data = []

    # Load all event lists and collect the labels used by the references.
    all_data = dcase_util.containers.MetaDataContainer()
    for file_pair in file_list:
        reference_event_list = sed_eval.io.load_event_list(
            filename=file_pair['reference_file']
        )
        estimated_event_list = sed_eval.io.load_event_list(
            filename=file_pair['estimated_file']
        )

        data.append({'reference_event_list': reference_event_list,
                    'estimated_event_list': estimated_event_list})

        all_data += reference_event_list

    event_labels = all_data.unique_event_labels

    # Only the segment-based metrics are reported below, so the event-based
    # metrics are no longer computed on every epoch.
    segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
        event_label_list=event_labels,
        time_resolution=1.0
    )

    for file_pair in data:
        segment_based_metrics.evaluate(
            reference_event_list=file_pair['reference_event_list'],
            estimated_event_list=file_pair['estimated_event_list']
        )

    overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
    curr_f1 = overall_segment_based_metrics['f_measure']['f_measure']
    curr_error = overall_segment_based_metrics['error_rate']['error_rate']

    if curr_f1 > self.best_f1:
      self.best_f1 = curr_f1
      self.model.save_weights("/content/model-best-f1.h5")

    if curr_error < self.best_error:
      self.best_error = curr_error
      self.model.save_weights("/content/model-best-error.h5")

    print("F-measure: {:.3f} vs {:.3f}".format(curr_f1, self.best_f1))
    print("Error rate: {:.3f} vs {:.3f}".format(curr_error, self.best_error))

      # Or print all metrics as reports

In [None]:
# One entry per depthwise-separable block of the network. The model-building
# loop reads entry[0] as the depthwise kernel size, entry[1] as the depthwise
# stride and entry[2] as the number of pointwise filters.
LAYER_DEFS = [
    # (kernel, stride, num_filters)
    ([3, 3], 1,   64),
    ([3, 3], 2,  128),
    ([3, 3], 1,  128),
    ([3, 3], 2,  256),
    ([3, 3], 1,  256),
    ([3, 3], 2,  512),
    ([3, 3], 1,  512),
    ([3, 3], 1,  512),
    ([3, 3], 1,  512),
    ([3, 3], 1,  512),
    ([3, 3], 1,  512),
    ([3, 3], 2, 1024),
    ([3, 3], 1, 1024),
    ([3, 3], 1, 512),
    ([3, 3], 1, 256),
    ([3, 3], 1, 128),
    # ([3, 3], 1, 128),
    # ([3, 3], 1, 128)
]

In [None]:
from tensorflow.keras import regularizers

In [None]:
from keras.regularizers import l2

In [None]:
"""
Manually define YOHO network
"""

# Input: one (257, 40) spectrogram per example, reshaped to a single-channel image.
m_features = tf.keras.Input(shape=(257, 40), name="mel_input")
X = m_features
X = tf.keras.layers.Reshape((257, 40, 1))(X)
# Stem: an ordinary strided 3x3 convolution followed by batch norm and ReLU.
# NOTE(review): bias_regularizer is given although use_bias=False — presumably
# it has no effect; confirm.
X = tf.keras.layers.Conv2D(filters = 32, kernel_size=[3, 3], strides=2, padding='same', use_bias=False,
                           activation=None, name = "layer1/conv",
                             kernel_regularizer=l2(1e-3), bias_regularizer=l2(1e-3))(X)
X = tf.keras.layers.BatchNormalization(center=True, scale=False, epsilon=1e-4, name = "layer1/bn")(X)
X = tf.keras.layers.ReLU(name="layer1/relu")(X)

# X = tf.keras.layers.SpatialDropout2D(0.5)(X)

# One depthwise-separable block per LAYER_DEFS entry: depthwise conv -> BN ->
# ReLU -> 1x1 pointwise conv -> BN -> ReLU -> spatial dropout. Layer names
# start at "layer2" because "layer1" is the stem above.
for i in range(len(LAYER_DEFS)):
  X = tf.keras.layers.DepthwiseConv2D(kernel_size=LAYER_DEFS[i][0], strides = LAYER_DEFS[i][1], depth_multiplier=1, padding='same', use_bias=False,
                                      activation=None, name="layer"+ str(i + 2)+"/depthwise_conv")(X)
  X = tf.keras.layers.BatchNormalization(center=True, scale=False, epsilon=1e-4, name = "layer"+ str(i + 2)+"/depthwise_conv/bn")(X)
  X = tf.keras.layers.ReLU(name="layer"+ str(i + 2)+"/depthwise_conv/relu")(X)
  X = tf.keras.layers.Conv2D(filters = LAYER_DEFS[i][2], kernel_size=[1, 1], strides=1, padding='same', use_bias=False, activation=None,
                             name = "layer"+ str(i + 2)+"/pointwise_conv",
                             kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01))(X)
  X = tf.keras.layers.BatchNormalization(center=True, scale=False, epsilon=1e-4, name = "layer"+ str(i + 2)+"/pointwise_conv/bn")(X)
  X = tf.keras.layers.ReLU(name="layer"+ str(i + 2)+"/pointwise_conv/relu")(X)

  X = tf.keras.layers.SpatialDropout2D(0.1)(X)

# Merge the last two axes of the feature map so that each position along the
# remaining axis carries one feature vector.
_, _, sx, sy = X.shape
X = tf.keras.layers.Reshape((-1, int(sx * sy)))(X)
# Output head: an 18-value sigmoid vector per position.
pred = tf.keras.layers.Conv1D(18,kernel_size=1, activation="sigmoid")(X)
model = tf.keras.Model(
      name='yamnet_frames', inputs=m_features,
      outputs=[pred])

In [None]:
model.summary()

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=my_loss_fn)

In [None]:
"""
Manually stop the training if the val. error rate does not decrease for 100 epochs. 
"""

model.fit(training_generator, validation_data=validation_generator, epochs=1000, callbacks=[MyCustomCallback_44()], verbose=1)

In [None]:
model.load_weights("/content/model-best-error.h5")

In [None]:
# Create the output folder first so that saving cannot fail on a missing
# directory; nothing else in the notebook creates it.
os.makedirs("/content/TUT-sound-events-2017", exist_ok=True)
model.save_weights("/content/TUT-sound-events-2017/YOHO-fold1.h5")

# Go back to the cell titled 'Split into folds'. Replace 'fold1' with 'fold2', 'fold3' and 'fold4' in turn for all occurrences in the notebook, training one model per fold. Save the models separately in the 'TUT-sound-events-2017' folder. After training models for all four folds, the code blocks below perform evaluation on the test set using an ensemble. 

In [None]:
# NOTE(review): this appends the SAME `model` object four times, so the list
# holds four references to one set of weights. Presumably each entry should be
# a separate model loaded with one fold's saved weights — confirm.
models = []
for i in range(4):
  models.append(model)

# Testing

In [None]:
def smoothe_events(events, max_event_silence = 1.0):
  """Merge same-class events that overlap or lie close together.

  Args:
    events: Iterable of ``[start, end, label]`` events; it is not modified.
    max_event_silence: Two events of the same class are merged when the gap
      between them is at most this many seconds.

  Returns:
    A new list of ``[start, end, label]`` events with times rounded to 3
    decimals, sorted by start time.
  """
  # Group copies of the events by class so the caller's lists stay untouched.
  class_wise_events = {}
  for e in events:
    class_wise_events.setdefault(e[2], []).append(list(e))

  all_events = []

  for curr_events in class_wise_events.values():
    # The merge only compares neighbours, so it needs start-sorted input.
    curr_events.sort(key=lambda x: x[0])
    merged = [curr_events[0]]
    for ev in curr_events[1:]:
      last = merged[-1]
      if (last[1] >= ev[0]) or (ev[0] - last[1] <= max_event_silence):
        last[1] = max(ev[1], last[1])
      else:
        merged.append(ev)
    all_events += merged

  for ev in all_events:
    ev[0] = round(ev[0], 3)
    ev[1] = round(ev[1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events

In [None]:
# Class names by class index; used to turn a predicted class index back into
# its label.
rev_class_dict = ['brakes squeaking',
              'car',
              'children',
              'large vehicle',
              'people speaking',
              'people walking']

In [None]:
def mk_preds_YOHO(model, audio_path, no_of_div = 9, hop_size = 1.96, discard = 0.3, win_length = 2.56, max_event_silence = 0.3, sampling_rate = 44100):
  """Predict sound events for one audio file with a single model.

  The file is cut into windows by construct_examples, each window is turned
  into a log-mel spectrogram, and the model output is decoded into
  [start, end, label] events which are then merged per class.

  Args:
    model: object exposing predict(); called on an array of shape
      (n_windows, 257, 40).
    audio_path: path handed to construct_examples.
    no_of_div: number of equal time bins each window is divided into.
    hop_size: hop between windows, forwarded to construct_examples.
    discard: accepted for interface compatibility; not used here.
    win_length: window length, forwarded to construct_examples.
    max_event_silence: same-class events whose gap is at most this value are
      merged.
    sampling_rate: accepted for interface compatibility; not used here.

  Returns:
    List of [start, end, label] with start/end rounded to three decimals,
    sorted by start.
  """
  a, win_ranges = construct_examples(audio_path, win_len=win_length,hop_len=hop_size)

  mss_in = np.zeros((len(a), 257, 40))
  for i in range(len(a)):
    mss_in[i, :, :] = get_log_melspectrogram(a[i]).T

  preds = model.predict(mss_in)

  win_width = win_length / no_of_div
  n_classes = len(rev_class_dict)

  # Decode the output: for class k, column 3k is the presence score and
  # columns 3k+1 / 3k+2 give the start offset and duration as fractions of
  # one time bin.
  class_wise_events = {}
  for i in range(len(preds)):
    p = preds[i, :, :]
    for j in range(len(p)):
      for k in range(n_classes):
        if p[j][k*3] >= 0.5:
          start = win_width * j + win_width * p[j][k*3+1] + win_ranges[i][0]
          end = p[j][k*3+2] * win_width + start
          class_wise_events.setdefault(rev_class_dict[k], []).append([start, end, rev_class_dict[k]])

  all_events = []

  for curr_events in class_wise_events.values():
    # With the default hop (1.96) shorter than the window (2.56) consecutive
    # windows presumably overlap, so events of one class are not guaranteed
    # to arrive in start order. The neighbour-wise merge needs sorted input.
    curr_events.sort(key=lambda x: x[0])
    count = 0

    while count < len(curr_events) - 1:
      if (curr_events[count][1] >= curr_events[count + 1][0]) or (curr_events[count + 1][0] - curr_events[count][1] <= max_event_silence):
        curr_events[count][1] = max(curr_events[count + 1][1], curr_events[count][1])
        del curr_events[count + 1]
      else:
        count += 1

    all_events += curr_events

  for e in all_events:
    e[0] = round(e[0], 3)
    e[1] = round(e[1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events



In [None]:
def frames_to_time(f, sr = 44100.0, hop_size = 441):
  """Convert a frame index to a time: f * hop_size / sr.

  With the defaults (hop of 441 samples at 44100 samples per second) one
  frame corresponds to 0.01.
  """
  n_samples = f * hop_size
  return n_samples / sr

def preds_to_se(p, win_start, audio_clip_length = 2.56):
  """Turn a frame-wise activation matrix into a list of sound events.

  Args:
    p: 2-D array indexed as p[frame, class]; a value >= 0.5 counts as active.
    win_start: offset added to every start and stop time.
    audio_clip_length: stop time (before the offset) used for events that are
      still active at the last frame.

  Returns:
    List of [start, stop, label] sorted by start; labels come from
    rev_class_dict and frame indices are converted with frames_to_time.
  """
  no_onset = -100  # sentinel: no event is currently open for this class
  audio_events = []
  last_frame = p.shape[0] - 1

  for j in range(p.shape[1]):
    # A class that is already active in frame 0 opens an event at frame 0.
    onset = 0 if p[0, j] >= 0.5 else no_onset

    for i in range(last_frame):
      if p[i, j] < 0.5 and p[i+1, j] >= 0.5:
        # Rising edge: the event starts at the first active frame.
        onset = i + 1
      elif p[i, j] >= 0.5 and p[i + 1, j] < 0.5:
        # Falling edge: close the event at the last active frame.
        audio_events.append([frames_to_time(onset) + win_start, frames_to_time(i) + win_start, rev_class_dict[j]])
        onset = no_onset

    if onset != no_onset:
      # Still active at the end of the window: extend to the clip length.
      audio_events.append([frames_to_time(onset) + win_start, audio_clip_length + win_start, rev_class_dict[j]])

  audio_events.sort(key = lambda x: x[0])
  return audio_events



In [None]:
# Scratch cell: step-by-step version of the YOHO decoding for one development
# file, using a merge gap of 1.0.
# NOTE(review): the read below is commented out, so `in_signal` (and `model`)
# must already exist in the notebook state when this cell runs.
# in_signal, in_sr = sf.read("")

# Window length and number of time bins per window used for decoding.
win_length = 2.56
no_of_div = 9

audio_clip_length_samples = in_signal.shape[0]
print('audio_clip_length_samples is {}'.format(audio_clip_length_samples))

a, win_ranges = construct_examples("/content/development/TUT-sound-events-2017-development/audio-mono/street/a001.wav",hop_len=1.96)

# `preds` is overwritten by model.predict below; this allocation is unused.
preds = np.zeros((len(a), 9, 18))
mss_in = np.zeros((len(a), 257, 40))

# One transposed log-mel spectrogram per window.
for i in range(len(a)):
  M = get_log_melspectrogram(a[i])
  mss_in[i, :, :] = M.T

preds = model.predict(mss_in)
events = []

# Decode every window: for class jjj, column 3*jjj is the presence score and
# columns 3*jjj+1 / 3*jjj+2 give start offset and duration as fractions of a bin.
for i in range(len(preds)):
  p = preds[i, :, :]
  events_curr = []
  win_width = win_length / no_of_div
  for j in range(len(p)):
    for jjj in range(0, 6):
      if p[j][jjj*3] >= 0.5:
        start = win_width * j + win_width * p[j][jjj*3+1] + win_ranges[i][0]
        end = p[j][jjj*3+2] * win_width + start
        events_curr.append([start, end, rev_class_dict[jjj]])

  events += events_curr


# Group the decoded events by class label.
class_set = set([c[2] for c in events])
class_wise_events = {}

for c in list(class_set):
  class_wise_events[c] = []


for c in events:
  class_wise_events[c[2]].append(c)

max_event_silence = 1.0
all_events = []

# Merge neighbouring events of a class that overlap or whose gap is at most
# max_event_silence; the merged event keeps the later of the two end times.
for k in list(class_wise_events.keys()):
  curr_events = class_wise_events[k]
  count = 0

  while count < len(curr_events) - 1:
    if (curr_events[count][1] >= curr_events[count + 1][0]) or (curr_events[count + 1][0] - curr_events[count][1] <= max_event_silence):
      curr_events[count][1] = max(curr_events[count + 1][1], curr_events[count][1])
      del curr_events[count + 1]
    else:
      count += 1

  all_events += curr_events

# Round times to three decimals and order the result by start time.
for i in range(len(all_events)):
  all_events[i][0] = round(all_events[i][0], 3)
  all_events[i][1] = round(all_events[i][1], 3)

all_events.sort(key=lambda x: x[0])


In [None]:
"""
Make predictions for full audio --- vectorised implementation.
"""

def mk_preds_vector(audio_path, no_of_div = 9, hop_size = 1.96, discard = 0.3, win_length = 2.56, sampling_rate = 44100):
  """Predict sound events for a full audio file using the global `model`.

  The signal is zero-padded to a whole number of windows, every window is
  converted to a log-mel spectrogram, and the decoded events of each window
  are clipped to that window's valid range before being merged with
  smoothe_events.

  Args:
    audio_path: file read with sf.read.
    no_of_div: number of equal time bins per window.
    hop_size: hop between windows, in seconds.
    discard: margin, in seconds, trimmed from window edges when clipping.
    win_length: window length, in seconds.
    sampling_rate: rate used to convert seconds to samples.
      NOTE(review): the rate returned by sf.read is ignored and no
      resampling is done - confirm the files already use this rate.

  Returns:
    The list returned by smoothe_events for the collected events.
  """
  in_signal, in_sr = sf.read(audio_path)

  audio_clip_length_samples = in_signal.shape[0]
  print('audio_clip_length_samples is {}'.format(audio_clip_length_samples))

  hop_size_samples = int(hop_size * sampling_rate)
  win_length_samples = int(win_length * sampling_rate)

  # Always analyse at least one window: for very short clips the ceil() term
  # can reach -1, which previously produced zero windows and no predictions.
  n_preds = max(1, int(math.ceil((audio_clip_length_samples - win_length_samples) / hop_size_samples)) + 1)

  # Zero-pad so that the last window is complete.
  in_signal_pad = np.zeros(((n_preds - 1) * hop_size_samples) + win_length_samples)
  in_signal_pad[0:audio_clip_length_samples] = in_signal

  mss_in = np.zeros((n_preds, 257, 40))

  for i in range(n_preds):
    seg = in_signal_pad[i * hop_size_samples:(i * hop_size_samples) + win_length_samples]
    mss_in[i, :, :] = get_log_melspectrogram(seg).T

  preds = model.predict(mss_in)

  win_width = win_length / no_of_div
  n_classes = len(rev_class_dict)
  events = []

  for j in range(n_preds):
    p = preds[j, :, :]

    # Valid time range of window j; decoded events are clipped to it.
    if j == 0:
      start = 0.0
      end = start + win_length
      if preds.shape[0] > 1:
        end -= discard
    elif j == n_preds - 1:
      start = j * hop_size + discard
      end = start - discard + win_length
    else:
      # NOTE(review): this evaluates to j * hop_size + win_length, i.e. no
      # margin is trimmed at the end of middle windows - confirm intent.
      start = j * hop_size + discard
      end = start + win_length - discard

    for i in range(len(p)):
      for jjj in range(n_classes):
        if p[i][jjj*3] >= 0.5:
          # Read the offsets of class jjj; the fixed columns 1 and 2 used
          # before always took the boundaries predicted for class 0.
          ev_start = win_width * i + win_width * p[i][jjj*3+1]
          ev_end = p[i][jjj*3+2] * win_width + ev_start

          ev_start = max(start, ev_start + j * hop_size)
          ev_end = min(end, ev_end + j * hop_size)

          # Events lying entirely inside a discarded margin end up with
          # end <= start after clipping; drop them instead of passing
          # inverted intervals to the merge step.
          if ev_end > ev_start:
            events.append([ev_start, ev_end, rev_class_dict[jjj]])

  smooth_events = smoothe_events(events)

  return smooth_events

In [None]:
# mk_preds_YOHO takes the model as its first argument and the audio path as
# its second; calling it with the path alone raises a TypeError.
mk_preds_YOHO(model, "/content/development/TUT-sound-events-2017-development/audio-mono/street/a001.wav")

In [None]:
# Scratch cell: decode the model output of one development file with
# preds_to_se and merge the resulting events per class.
a, win_ranges = construct_examples("/content/development/TUT-sound-events-2017-development/audio-mono/street/a001.wav", win_len=2.56,hop_len=1.96)

max_event_silence = 0.3
# `preds` is overwritten by model.predict below; this allocation is unused.
preds = np.zeros((len(a), 9, 18))
mss_in = np.zeros((len(a), 257, 40))

# One transposed log-mel spectrogram per window.
for i in range(len(a)):
  M = get_log_melspectrogram(a[i])
  mss_in[i, :, :] = M.T

preds = model.predict(mss_in)
events = []

for i in range(len(preds)):
  p = preds[i, :, :]
  events_curr = []
  # win_width = win_length / no_of_div

  # NOTE(review): `win_length` is not assigned in this cell; it has to be
  # defined by an earlier cell before this one runs.
  events_curr = preds_to_se(p, win_start = win_ranges[i][0], audio_clip_length=win_length)

  events += events_curr

print(events)

# Group the events by class label.
class_set = set([c[2] for c in events])
class_wise_events = {}

for c in list(class_set):
  class_wise_events[c] = []


for c in events:
  class_wise_events[c[2]].append(c)


all_events = []

# Merge neighbouring events of a class that overlap or whose gap is at most
# max_event_silence; the merged event keeps the later of the two end times.
for k in list(class_wise_events.keys()):
  curr_events = class_wise_events[k]
  count = 0

  while count < len(curr_events) - 1:
    if (curr_events[count][1] >= curr_events[count + 1][0]) or (curr_events[count + 1][0] - curr_events[count][1] <= max_event_silence):
      curr_events[count][1] = max(curr_events[count + 1][1], curr_events[count][1])
      del curr_events[count + 1]
    else:
      count += 1

  all_events += curr_events

# Round times to three decimals and order the result by start time.
for i in range(len(all_events)):
  all_events[i][0] = round(all_events[i][0], 3)
  all_events[i][1] = round(all_events[i][1], 3)

all_events.sort(key=lambda x: x[0])

print(all_events)


In [None]:
# `np.float` was a deprecated alias of the builtin float and has been removed
# from NumPy; the builtin gives the same float64 result.
# NOTE(review): preds_to_se indexes its input as p[frame, class], whereas the
# model output is indexed with three axes elsewhere in this notebook -
# confirm that a single 2-D slice should be passed here.
preds_to_se((model.predict(mss_in) >= 0.5).astype(float), win_start=0.0)

# Extract annotations

In [None]:
import os.path

In [None]:
def extract_labels(annotation_path, output_dir="/content/eval-files/"):
  """Write a reduced copy of an annotation file as 'start,end,label' lines.

  Args:
    annotation_path: file parsed with read_annotation. Fields 2, 3 and 4 of
      every row are used (presumably onset, offset and event label -
      TODO confirm against read_annotation).
    output_dir: folder that receives a file with the same base name; it is
      created when missing.
  """
  events = read_annotation(annotation_path)

  ann = [[float(e[2]), float(e[3]), e[4]] for e in events]

  os.makedirs(output_dir, exist_ok=True)
  n_label = os.path.join(output_dir, os.path.basename(annotation_path))

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in ann))

In [None]:
# exist_ok keeps the cell re-runnable; os.mkdir raises once the folder exists.
os.makedirs("/content/eval-files/", exist_ok=True)

In [None]:
# Derive each annotation path from the audio path (".wav" -> ".ann",
# "audio-mono" -> "meta") and export its labels.
for audio in fold1_val_files:
  extract_labels(audio.replace(".wav", ".ann").replace("audio-mono", "meta"))

In [None]:
# Display the fold-1 validation file list.
fold1_val_files

In [None]:
# Display the fold-1 validation file list (repeats the previous cell).
fold1_val_files

In [None]:
# Load the fold-1 weights that are evaluated in the following cells.
model.load_weights("/content/model-best-error-YamNet-fold1.h5")

In [None]:
# Predict events for every fold-1 validation file and store them next to the
# reference annotations as "<name>-se-prediction.ann".
for audio in fold1_val_files:
  audio_file_path = audio
  see = mk_preds_YOHO(model, audio_file_path)
  n_label = "/content/eval-files/" + os.path.basename(audio_file_path).replace(".wav" ,"") + "-se-prediction.ann"

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))

In [None]:
# Reference annotations: names ending in a digit before ".ann", which leaves
# out the "-se-prediction.ann" files stored in the same folder.
destination = "/content/eval-files/"
test_set = glob.glob(destination + "*[0-9].ann")

print(test_set)

# Folder that receives the metric reports written at the end of this cell.
eval_path = "/content/"


# Pair every reference file with the prediction file derived from its name.
file_list = [
    {
    'reference_file': tt,
    'estimated_file': tt.replace(".ann","-se-prediction.ann")
    }
    for tt in test_set
]

data = []

# Get used event labels
# NOTE(review): sed_eval, dcase_util and pickle are imported in cells further
# down this notebook; confirm they are imported before this cell runs.
all_data = dcase_util.containers.MetaDataContainer()
for file_pair in file_list:
    reference_event_list = sed_eval.io.load_event_list(
        filename=file_pair['reference_file']
    )
    estimated_event_list = sed_eval.io.load_event_list(
        filename=file_pair['estimated_file']
    )

    data.append({'reference_event_list': reference_event_list,
                'estimated_event_list': estimated_event_list})

    all_data += reference_event_list

# Only labels that occur in the reference annotations are evaluated.
event_labels = all_data.unique_event_labels

# Start evaluating

# Create metrics classes, define parameters
segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=event_labels,
    time_resolution=1.0
)

event_based_metrics = sed_eval.sound_event.EventBasedMetrics(
    event_label_list=event_labels,
    t_collar=1.0
)

# Go through files
for file_pair in data:
    segment_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

    event_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

# Get only certain metrics
overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
print("Accuracy:", overall_segment_based_metrics['accuracy']['accuracy'])

# Or print all metrics as reports

# Each metrics object is stored twice under eval_path: as a text report
# (its str() form) and as a pickle of the object itself.
model_basename = "YamNet-fold1.h5"
seg_eval_basename = "seg eval " + model_basename.replace(".h5", "") + ".txt"
ev_eval_basename = "event eval " + model_basename.replace(".h5", "") + ".txt"
with open(os.path.join(eval_path, seg_eval_basename), mode='w') as fp:
  fp.write(str(segment_based_metrics))

with open(eval_path + "/seg eval " + model_basename.replace(".h5", "") + ".pickle", 'wb') as f:
  pickle.dump(segment_based_metrics, f, pickle.HIGHEST_PROTOCOL)

with open(os.path.join(eval_path, ev_eval_basename), mode = 'w') as fp:
  fp.write(str(event_based_metrics))

with open(eval_path + "/event eval " + model_basename.replace(".h5", "") + ".pickle", 'wb') as f:
  pickle.dump(event_based_metrics, f, pickle.HIGHEST_PROTOCOL)   

In [None]:
# Collect every entry directly inside /content/SACRNN/.
files = glob.glob("/content/SACRNN/*")

In [None]:
# Display the collected paths.
files

In [None]:
# NOTE(review): `files` is globbed from /content/SACRNN/ itself, so the
# source and destination folder coincide here; shutil.move presumably fails
# because the destination path already exists - confirm the intended source.
for f in files:
    shutil.move(f, '/content/SACRNN/')

In [None]:
# Pack the collected files into one archive on the mounted drive. Each file
# is added with the path stored in `files` (no explicit arcname is given).
with ZipFile("/content/drive/MyDrive/TUT-sound-events-2017/Models/2-7-21/SACRNN.zip", 'w') as my_zip:
  for f in files:
    my_zip.write(f)

In [None]:
# Display the overall segment-based F-measure from the last evaluation run.
overall_segment_based_metrics['f_measure']['f_measure']

In [None]:
!nvidia-smi

# Test on final Evaluation set

In [None]:
def mk_ens_preds_YOHO(models, audio_path, no_of_div = 9, hop_size = 1.96, discard = 0.3, win_length = 2.56, max_event_silence = 0.3, sampling_rate = 44100):
  """Predict sound events for one audio file with an ensemble of models.

  Every model predicts ten times on freshly masked copies of the input
  (one frequency mask, two time masks). All predictions are averaged and the
  mean output is decoded into [start, end, label] events that are merged
  per class.

  Args:
    models: iterable of objects exposing predict().
    audio_path: path handed to construct_examples.
    no_of_div: number of equal time bins each window is divided into.
    hop_size: hop between windows, forwarded to construct_examples.
    discard: accepted for interface compatibility; not used here.
    win_length: window length, forwarded to construct_examples.
    max_event_silence: same-class events whose gap is at most this value are
      merged.
    sampling_rate: accepted for interface compatibility; not used here.

  Returns:
    List of [start, end, label] with start/end rounded to three decimals,
    sorted by start.
  """
  a, win_ranges = construct_examples(audio_path, win_len=win_length,hop_len=hop_size)

  mss_in = np.zeros((len(a), 257, 40))
  for i in range(len(a)):
    mss_in[i, :, :] = get_log_melspectrogram(a[i]).T

  # Loop-invariant inputs of the augmentation.
  # NOTE(review): `v` is taken from axis 0 of mss_in (the number of windows);
  # confirm this is the value frequency_masking expects for `v`.
  v = mss_in.shape[0]
  tau = mss_in.shape[1]
  X_in = mss_in.reshape((-1, 257, 40, 1))

  n_augmentations = 10
  yhats = []

  for model in models:
    for _ in range(n_augmentations):
      warped_frequency_spectrogram = spec_augment_tensorflow.frequency_masking(X_in, v=v,  frequency_masking_para=8, frequency_mask_num=1)
      warped_frequency_time_sepctrogram = spec_augment_tensorflow.time_masking(warped_frequency_spectrogram, tau=tau, time_masking_para=25, time_mask_num=2)

      yhats.append(model.predict(warped_frequency_time_sepctrogram))

  # Average over all models and augmentation rounds.
  preds = np.mean(np.array(yhats), axis=0)

  win_width = win_length / no_of_div
  n_classes = len(rev_class_dict)

  # Decode the mean output: for class k, column 3k is the presence score and
  # columns 3k+1 / 3k+2 give the start offset and duration as fractions of
  # one time bin.
  class_wise_events = {}
  for i in range(len(preds)):
    p = preds[i, :, :]
    for j in range(len(p)):
      for k in range(n_classes):
        if p[j][k*3] >= 0.5:
          start = win_width * j + win_width * p[j][k*3+1] + win_ranges[i][0]
          end = p[j][k*3+2] * win_width + start
          class_wise_events.setdefault(rev_class_dict[k], []).append([start, end, rev_class_dict[k]])

  all_events = []

  for curr_events in class_wise_events.values():
    # With the default hop (1.96) shorter than the window (2.56) consecutive
    # windows presumably overlap, so events of one class are not guaranteed
    # to arrive in start order. The neighbour-wise merge needs sorted input.
    curr_events.sort(key=lambda x: x[0])
    count = 0

    while count < len(curr_events) - 1:
      if (curr_events[count][1] >= curr_events[count + 1][0]) or (curr_events[count + 1][0] - curr_events[count][1] <= max_event_silence):
        curr_events[count][1] = max(curr_events[count + 1][1], curr_events[count][1])
        del curr_events[count + 1]
      else:
        count += 1

    all_events += curr_events

  for e in all_events:
    e[0] = round(e[0], 3)
    e[1] = round(e[1], 3)

  all_events.sort(key=lambda x: x[0])

  return all_events



In [None]:
# exist_ok keeps the cell re-runnable; os.mkdir raises once the folder exists.
os.makedirs("/content/eval-files-final", exist_ok=True)

In [None]:
# All .wav files of the evaluation set's street recordings.
audio_files = glob.glob("/content/evaluation/TUT-sound-events-2017-evaluation/audio/street/*.wav")

In [None]:
# Create the "audio-mono" counterpart of the folder holding the evaluation
# audio. os.path.dirname avoids relying on a separately imported `dirname`,
# and exist_ok keeps the cell re-runnable.
os.makedirs(os.path.dirname(audio_files[0]).replace("audio", "audio-mono"), exist_ok=True)

In [None]:
# Down-mix every evaluation file to one channel with sox, writing the result
# to the matching path under "audio-mono".
for sound in audio_files:
  temp_file = sound.replace("audio", "audio-mono")
  # Pass the arguments as a list and skip the shell, so paths containing
  # spaces or shell metacharacters reach sox unchanged.
  command = ["sox", sound, temp_file, "channels", "1"]
  p = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  output, err = p.communicate()
  # Report failures instead of dropping them; later cells read the mono files.
  if p.returncode != 0:
    print("sox failed for {}: {}".format(sound, err.decode(errors="replace")))

In [None]:
# Mono versions of the evaluation recordings.
audio_files_mono = glob.glob("/content/evaluation/TUT-sound-events-2017-evaluation/audio-mono/street/*.wav")

In [None]:
# Files evaluated below (same glob pattern as audio_files_mono).
test_files = glob.glob("/content/evaluation/TUT-sound-events-2017-evaluation/audio-mono/street/*.wav")

In [None]:
def extract_labels_3(annotation_path, output_dir="/content/eval-files-final/"):
  """Write a three-column annotation file as 'start,end,label' lines.

  Args:
    annotation_path: file parsed with read_annotation. Fields 0, 1 and 2 of
      every row are used (presumably onset, offset and event label -
      TODO confirm against read_annotation).
    output_dir: folder that receives a file with the same base name; it is
      created when missing.
  """
  events = read_annotation(annotation_path)

  ann = [[float(e[0]), float(e[1]), e[2]] for e in events]

  os.makedirs(output_dir, exist_ok=True)
  n_label = os.path.join(output_dir, os.path.basename(annotation_path))

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in ann))

In [None]:
# Derive each annotation path from the audio path (".wav" -> ".ann",
# "audio-mono" -> "meta") and export its labels.
for audio in test_files:
  extract_labels_3(audio.replace(".wav", ".ann").replace("audio-mono", "meta"))


In [None]:
# os.mkdir("/content/drive/MyDrive/TUT-sound-events-2017/Models/2-7-21/")

In [None]:
zip_name = "/content/drive/MyDrive/TUT-sound-events-2017/Models/2-7-21/SACRNN.zip"
# Bind the archive to a name other than `zip`, so the builtin zip() stays
# usable in later cells. extractall() unpacks into the current directory.
with ZipFile(zip_name, 'r') as archive:
  archive.extractall()

In [None]:
# Load the weights of fold i+1 into the i-th ensemble member.
for i in range(4):
  models[i].load_weights("/content/content/SACRNN/model-best-error-SACRNN-fold" + str(i+1) + ".h5")

In [None]:
# Predict events for every evaluation file with the ensemble and store them
# next to the reference annotations as "<name>-se-prediction.ann".
for audio in test_files:
  audio_file_path = audio
  # The ensemble predictor defined in this section is mk_ens_preds_YOHO;
  # the previously called name, mk_ens_preds_CRNN, is not defined there.
  see = mk_ens_preds_YOHO(models, audio_file_path)
  n_label = "/content/eval-files-final/" + os.path.basename(audio_file_path).replace(".wav" ,"") + "-se-prediction.ann"

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))


In [None]:
# Reference annotations: names ending in a digit before ".ann", which leaves
# out the "-se-prediction.ann" files stored in the same folder.
destination = "/content/eval-files-final/"
test_set = glob.glob(destination + "*[0-9].ann")

print(test_set)

# Folder that receives the metric reports written at the end of this cell.
eval_path = "/content/"


# Pair every reference file with the prediction file derived from its name.
file_list = [
    {
    'reference_file': tt,
    'estimated_file': tt.replace(".ann","-se-prediction.ann")
    }
    for tt in test_set
]

data = []

# Get used event labels
# NOTE(review): sed_eval, dcase_util and pickle are imported in cells further
# down this notebook; confirm they are imported before this cell runs.
all_data = dcase_util.containers.MetaDataContainer()
for file_pair in file_list:
    reference_event_list = sed_eval.io.load_event_list(
        filename=file_pair['reference_file']
    )
    estimated_event_list = sed_eval.io.load_event_list(
        filename=file_pair['estimated_file']
    )

    data.append({'reference_event_list': reference_event_list,
                'estimated_event_list': estimated_event_list})

    all_data += reference_event_list

# Only labels that occur in the reference annotations are evaluated.
event_labels = all_data.unique_event_labels

# Start evaluating

# Create metrics classes, define parameters
segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=event_labels,
    time_resolution=1.0
)

event_based_metrics = sed_eval.sound_event.EventBasedMetrics(
    event_label_list=event_labels,
    t_collar=1.0
)

# Go through files
for file_pair in data:
    segment_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

    event_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

# Get only certain metrics
overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
print("Accuracy:", overall_segment_based_metrics['accuracy']['accuracy'])

# Or print all metrics as reports

# Each metrics object is stored twice under eval_path: as a text report
# (its str() form) and as a pickle of the object itself.
model_basename = "SACRNN-ensemble-no-tta-0_3.h5"
seg_eval_basename = "seg eval " + model_basename.replace(".h5", "") + ".txt"
ev_eval_basename = "event eval " + model_basename.replace(".h5", "") + ".txt"
with open(os.path.join(eval_path, seg_eval_basename), mode='w') as fp:
  fp.write(str(segment_based_metrics))

with open(eval_path + "/seg eval " + model_basename.replace(".h5", "") + ".pickle", 'wb') as f:
  pickle.dump(segment_based_metrics, f, pickle.HIGHEST_PROTOCOL)

with open(os.path.join(eval_path, ev_eval_basename), mode = 'w') as fp:
  fp.write(str(event_based_metrics))

with open(eval_path + "/event eval " + model_basename.replace(".h5", "") + ".pickle", 'wb') as f:
  pickle.dump(event_based_metrics, f, pickle.HIGHEST_PROTOCOL)   

# On Evaluation

In [None]:
def extract_labels(annotation_path, output_dir="/content/eval-files-2/"):
  """Write a reduced copy of an annotation file as 'start,end,label' lines.

  Args:
    annotation_path: file parsed with read_annotation. Fields 2, 3 and 4 of
      every row are used (presumably onset, offset and event label -
      TODO confirm against read_annotation).
    output_dir: folder that receives a file with the same base name. It is
      created when missing, since open() would otherwise fail on a folder
      that does not exist yet.
  """
  events = read_annotation(annotation_path)

  ann = [[float(e[2]), float(e[3]), e[4]] for e in events]

  os.makedirs(output_dir, exist_ok=True)
  n_label = os.path.join(output_dir, os.path.basename(annotation_path))

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in ann))

In [None]:
# os.mkdir("/content/eval-files/")

In [None]:
# Derive each annotation path from the audio path (".wav" -> ".ann",
# "audio-mono" -> "meta") and export its labels.
for audio in fold1_val_files:
  extract_labels(audio.replace(".wav", ".ann").replace("audio-mono", "meta"))

In [None]:
# Predict events for every fold-1 validation file with mk_preds_vector and
# store them as "<name>-se-prediction.ann".
# NOTE(review): predictions go to /content/eval-files/, while the
# extract_labels definition in this section writes references to
# /content/eval-files-2/ by default - confirm which folder is intended.
for audio in fold1_val_files:
  audio_file_path = audio
  see = mk_preds_vector(audio_file_path)
  n_label = "/content/eval-files/" + os.path.basename(audio_file_path).replace(".wav" ,"") + "-se-prediction.ann"

  with open(n_label, 'w') as fp:
    fp.write('\n'.join('{},{},{}'.format(round(x[0], 5), round(x[1], 5), x[2]) for x in see))

In [None]:
# !pip install sed_eval

In [None]:
import sed_eval
import dcase_util

In [None]:
import pickle

In [None]:
# Reference annotations: names ending in a digit before ".ann", which leaves
# out the "-se-prediction.ann" files stored in the same folder.
destination = "/content/eval-files/"
test_set = glob.glob(destination + "*[0-9].ann")

print(test_set)

# Folder that receives the metric reports written at the end of this cell.
eval_path = "/content/"


# Pair every reference file with the prediction file derived from its name.
file_list = [
    {
    'reference_file': tt,
    'estimated_file': tt.replace(".ann","-se-prediction.ann")
    }
    for tt in test_set
]

data = []

# Get used event labels
all_data = dcase_util.containers.MetaDataContainer()
for file_pair in file_list:
    reference_event_list = sed_eval.io.load_event_list(
        filename=file_pair['reference_file']
    )
    estimated_event_list = sed_eval.io.load_event_list(
        filename=file_pair['estimated_file']
    )

    data.append({'reference_event_list': reference_event_list,
                'estimated_event_list': estimated_event_list})

    all_data += reference_event_list

# Only labels that occur in the reference annotations are evaluated.
event_labels = all_data.unique_event_labels

# Start evaluating

# Create metrics classes, define parameters
segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=event_labels,
    time_resolution=1.0
)

event_based_metrics = sed_eval.sound_event.EventBasedMetrics(
    event_label_list=event_labels,
    t_collar=1.0
)

# Go through files
for file_pair in data:
    segment_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

    event_based_metrics.evaluate(
        reference_event_list=file_pair['reference_event_list'],
        estimated_event_list=file_pair['estimated_event_list']
    )

# Get only certain metrics
overall_segment_based_metrics = segment_based_metrics.results_overall_metrics()
print("Accuracy:", overall_segment_based_metrics['accuracy']['accuracy'])

# Or print all metrics as reports

# Each metrics object is stored twice under eval_path: as a text report
# (its str() form) and as a pickle of the object itself.
model_basename = "YamNet-4.h5"
seg_eval_basename = "seg eval " + model_basename.replace(".h5", "") + ".txt"
ev_eval_basename = "event eval " + model_basename.replace(".h5", "") + ".txt"
with open(os.path.join(eval_path, seg_eval_basename), mode='w') as fp:
  fp.write(str(segment_based_metrics))

with open(eval_path + "/seg eval " + model_basename.replace(".h5", "") + ".pickle", 'wb') as f:
  pickle.dump(segment_based_metrics, f, pickle.HIGHEST_PROTOCOL)

with open(os.path.join(eval_path, ev_eval_basename), mode = 'w') as fp:
  fp.write(str(event_based_metrics))

with open(eval_path + "/event eval " + model_basename.replace(".h5", "") + ".pickle", 'wb') as f:
  pickle.dump(event_based_metrics, f, pickle.HIGHEST_PROTOCOL)   