#Import necessary libraries

In [None]:
# !pip install numpy==1.23.1

In [None]:
import librosa
import os
import h5py
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
from tensorflow import keras
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Define paths and load the data, setup the label binarizer

In [None]:
# Shared Drive folder holding the annotation CSVs, the audio archive and the saved model.
path = '/content/drive/MyDrive/GMM Group'
model_path = os.path.join(path, 'model0516_3noaug.h5')

# Read-only handle to the audio archive; it stays open because the generators
# index into it lazily while the datasets are being consumed.
hdf5_file = h5py.File(os.path.join(path, 'EPIC_audio.hdf5'), mode='r')

# Fixing the class list up front keeps every label vector 44 wide,
# whichever class ids happen to occur in a given window.
label_binarizer = MultiLabelBinarizer(classes=np.arange(0, 44))

#Define a generator function (base_generator) to yield audio segments and their labels

In [None]:
def base_generator(annotation_file, hdf5_file, segment_length=3, step=1, sample_rate=24000, augment=False):
  """Yield (log-mel spectrogram, multi-hot label vector) pairs for sliding audio windows.

  Args:
    annotation_file: CSV file name, resolved relative to the module-level `path`.
      The file must provide the columns `video_id`, `start_sample`,
      `stop_sample` and `class_id`.
    hdf5_file: Mapping from `video_id` to that recording's audio samples.
    segment_length: Window length in seconds.
    step: Hop between consecutive windows in seconds.
    sample_rate: Audio sample rate in Hz; also the "one second" threshold used
      when matching annotations to a window.
    augment: When True, each window is randomly pitch-shifted, mixed with white
      noise, or left untouched (one of the three is picked uniformly).

  Yields:
    Tuples `(log_spectrogram, label_vector)`: a 128-band log-mel spectrogram
    with a trailing channel axis, and a length-44 multi-hot vector produced by
    the module-level `label_binarizer`.
  """
  annotations = pd.read_csv(os.path.join(path, annotation_file))
  frame_length = segment_length * sample_rate
  # The hop must be the same value for framing and for the label windows below,
  # otherwise frames and labels drift apart as soon as step != 1.
  frame_step = step * sample_rate

  for key in annotations['video_id'].unique():
    audio_data = hdf5_file[key]
    file_annotations = annotations[annotations['video_id'] == key]

    # Per-annotation quantities do not depend on the window, so compute them once per recording.
    starts = file_annotations['start_sample']
    stops = file_annotations['stop_sample']
    durations = stops - starts
    is_short = durations < sample_rate
    is_long = durations >= sample_rate

    frames = tf.signal.frame(audio_data, frame_length, frame_step, axis=0, pad_end=True)
    for index, frame in enumerate(frames):
      start_segment = index * frame_step
      stop_segment = start_segment + frame_length

      # Events shorter than one second count only when they lie fully inside the window.
      full_sample = is_short & (starts >= start_segment) & (stops <= stop_segment)

      # Longer events count when the window reaches more than one second past the
      # event start and the event reaches more than one second past the window start.
      part_sample = is_long & \
              ((stop_segment - starts > sample_rate) & (stops - start_segment > sample_rate))

      frame = frame.numpy()

      if augment:
        # Choice 3 deliberately leaves the window unchanged.
        choice = random.choice([1,2,3])
        if choice == 1:
          frame = librosa.effects.pitch_shift(frame, sr=sample_rate, n_steps=4)
        elif choice == 2:
          noise_factor = 0.005
          white_noise = np.random.randn(len(frame)) * noise_factor
          frame = frame + white_noise

      filtered_samples = file_annotations[full_sample | part_sample]
      # The binarizer expects one collection of class ids per sample, hence the extra leading axis.
      class_ids = np.expand_dims(filtered_samples['class_id'].to_numpy(), axis = 0)
      label_vector = label_binarizer.fit_transform(class_ids).reshape((44))

      spectrogram = librosa.feature.melspectrogram(y=frame, sr=sample_rate, n_mels=128)
      log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
      # Add a channel axis so the spectrogram can be fed to Conv2D layers.
      log_spectrogram = np.expand_dims(log_spectrogram, axis=2)

      yield (log_spectrogram, label_vector)

#Define train, validation generators.

In [None]:
def train_generator():
  """Return a fresh generator over the training annotations, with augmentation disabled."""
  training_samples = base_generator(
      annotation_file='EPIC_Sounds_train_full.csv',
      hdf5_file=hdf5_file,
      augment=False,
  )
  return training_samples

In [None]:
def validation_generator():
  """Return a fresh generator over the validation annotations (default settings, no augmentation)."""
  validation_samples = base_generator(
      annotation_file='EPIC_Sounds_validation_full.csv',
      hdf5_file=hdf5_file,
  )
  return validation_samples

#Create train, validation datasets

In [None]:
# Shapes and dtypes of the (spectrogram, label vector) pairs yielded by the generators.
# The label shape is written as the 1-tuple (44,): a bare (44) is just the integer 44
# and only works because TensorFlow coerces it into a shape.
output_shapes = ((128, 141, 1), (44,))
output_types = (tf.float32, tf.float32)

# Wrap the Python generators as lazily evaluated tf.data pipelines.
# NOTE(review): newer TensorFlow releases recommend `output_signature` over
# `output_types`/`output_shapes`; consider migrating.
train_dataset = tf.data.Dataset.from_generator(train_generator, output_shapes=output_shapes, output_types=output_types)
val_dataset = tf.data.Dataset.from_generator(validation_generator, output_shapes=output_shapes, output_types=output_types)

In [None]:
# Batch the training stream, cache the finished batches so later epochs skip the
# spectrogram computation, and prefetch so input preparation overlaps with training.
batch_size = 32
train_dataset = (
    train_dataset
    .batch(batch_size)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Same batching / caching / prefetching pipeline as the training data.
val_dataset = (
    val_dataset
    .batch(batch_size)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

#Build a model


In [None]:
from tensorflow import keras

# CNN over log-mel spectrograms: eight Conv2D + BatchNormalization + ReLU blocks
# (seven of them followed by 2x2 max pooling), then two dense layers with dropout
# and a 44-unit sigmoid head that scores each class independently (multi-label).
model = keras.models.Sequential([
    # input shape (128,141,1)
    # 1st block: Conv2D+BatchNormalization+ReLU --> 128, 141, 1 becomes 128, 141, 32
    keras.layers.Conv2D(32,(5,5),padding="same",input_shape=(128,141,1)),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    # 2nd block: Conv2D+BatchNormalization+ReLU+MaxPooling2D --> 128, 141, 32 becomes 64, 70, 32
    # (default "valid" pooling drops the odd last column: 141 -> 70)
    keras.layers.Conv2D(32,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(),
    # 3rd block: Conv2D+BatchNormalization+ReLU --> 64, 70, 32 becomes 64, 70, 64
    keras.layers.Conv2D(64,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    # 4th block: Conv2D+BatchNormalization+ReLU+MaxPooling2D --> 64, 70, 64 becomes 32, 35, 64
    keras.layers.Conv2D(64,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(padding="same"),
    # 5th block: Conv2D+BatchNormalization+ReLU+MaxPooling2D --> 32, 35, 64 becomes 16, 18, 128
    # ("same" pooling rounds odd sizes up: 35 -> 18)
    keras.layers.Conv2D(128,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(padding="same"),
    # 6th block: Conv2D+BatchNormalization+ReLU+MaxPooling2D --> 16, 18, 128 becomes 8, 9, 128
    keras.layers.Conv2D(128,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(padding="same"),
    # 7th block: Conv2D+BatchNormalization+ReLU+MaxPooling2D --> 8, 9, 128 becomes 4, 5, 256
    keras.layers.Conv2D(256,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(padding="same"),
    # 8th block: Conv2D+BatchNormalization+ReLU+MaxPooling2D --> 4, 5, 256 becomes 2, 3, 256
    keras.layers.Conv2D(256,(3,3),padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.MaxPooling2D(padding="same"),
    # flatten the feature maps --> 2, 3, 256 becomes 1536
    keras.layers.Flatten(),
    # fully connected layer with 20% dropout --> 1536 becomes 1024
    keras.layers.Dense(1024, activation="relu"),
    keras.layers.Dropout(0.2),
    # second fully connected layer with 20% dropout --> 1024 becomes 128
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.2),
    # output layer: one independent sigmoid score per class --> 128 becomes 44
    keras.layers.Dense(44, activation="sigmoid")
])
# Compile for multi-label classification: binary cross-entropy is applied to each
# of the 44 sigmoid outputs independently.
# NOTE(review): with a 44-wide output Keras may resolve "accuracy" to an
# argmax-match (categorical) accuracy rather than a per-label accuracy -- confirm
# which one is reported before interpreting the number.
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 128, 141, 32)      832       
                                                                 
 batch_normalization (BatchN  (None, 128, 141, 32)     128       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 128, 141, 32)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 128, 141, 32)      9248      
                                                                 
 batch_normalization_1 (Batc  (None, 128, 141, 32)     128       
 hNormalization)                                                 
                                                                 
 activation_1 (Activation)   (None, 128, 141, 32)      0

#Train the model

In [None]:
# Stop once the validation loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
# Divide the learning rate by 10 after 3 epochs without a validation-loss gain of at least 1e-4.
reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=3,
    min_delta=1e-4,
    verbose=1,
)
callbacks = [early_stopping, reduce_lr_on_plateau]

history = model.fit(
    train_dataset,
    epochs=50,
    validation_data=val_dataset,
    callbacks=callbacks,
)

#If needed, load or save the model


In [None]:
# Replace the in-memory model with the one stored at model_path.
# Caution: when this cell runs after training (e.g. "Run All"), the freshly
# trained weights are discarded in favour of the saved file.
model = tf.keras.models.load_model(model_path)
# To persist the current model instead, comment out the line above and uncomment:
# model.save(model_path)

#Create test generator, test dataset

In [None]:
def test_generator():
  """Return a fresh generator over the test annotations (default settings, no augmentation)."""
  test_samples = base_generator(
      annotation_file='EPIC_Sounds_test_full.csv',
      hdf5_file=hdf5_file,
  )
  return test_samples

In [None]:
# Build the test pipeline the same way as the train/validation ones:
# generator -> batches -> cache -> prefetch.
test_dataset = (
    tf.data.Dataset.from_generator(
        test_generator,
        output_shapes=output_shapes,
        output_types=output_types,
    )
    .batch(batch_size)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Returns [loss, accuracy] on the test set (the loss and the single metric the model was compiled with).
model.evaluate(test_dataset)

740/740 [==============================] - 342s 462ms/step - loss: 0.0849 - accuracy: 0.3668

[0.0848626121878624, 0.3667624294757843]

No augmentations.

740/740 [==============================] - 347s 467ms/step - loss: 0.0767 - accuracy: 0.3772

[0.07674212008714676, 0.3771544396877289]

With augmentations.

#Calculate metrics, predictions.

In [None]:
# Force tf.function-decorated code to run eagerly from here on.
# NOTE(review): the reason is not recorded in this notebook and eager mode slows
# down predict/evaluate -- confirm it is still needed.
tf.config.run_functions_eagerly(True)

In [None]:
# Per-class scores for every test window, in the same order in which test_dataset yields them.
predictions = model.predict(test_dataset)



In [None]:
# Collect the ground-truth multi-hot label matrix in the same order the dataset
# yields it, so that its rows line up with `predictions`.
y_true = np.concatenate([label.numpy() for _, label in test_dataset], axis=0)

# Per-sample arrays of active class indices. Rows hold different numbers of
# labels, so keep them in a plain list: np.array() on ragged sequences emits a
# VisibleDeprecationWarning on older NumPy and raises ValueError from NumPy 1.24.
y_true_labels = [np.flatnonzero(row == 1) for row in y_true]

  y_true_labels = np.array([np.where(row==1)[0] for row in y_true])


In [None]:
# Top-1 hit rate: a window counts as correct when its highest-scoring class is one of its true labels.
top1_predictions = np.argmax(predictions, axis=1)
top1_correct = [
    predicted_class in true_classes
    for predicted_class, true_classes in zip(top1_predictions, y_true_labels)
]
np.mean(top1_correct)

0.5143629604596147

In [None]:
# Top-5 hit rate: a window counts as correct when any of its five highest-scoring classes is a true label.
ranked_classes = np.argsort(predictions, axis=1)
top5_predictions = ranked_classes[:, -5:]
top5_correct = [
    any(candidate in true_classes for candidate in candidates)
    for candidates, true_classes in zip(top5_predictions, y_true_labels)
]
np.mean(top5_correct)

0.805888813788442

In [None]:
# Turn the sigmoid scores into hard 0/1 decisions with a 0.5 threshold.
above_threshold = predictions > 0.5
predictions_binary = np.where(above_threshold, 1, 0)

In [None]:
from sklearn.metrics import f1_score
# Support-weighted F1 over all classes. The result gets its own name: assigning
# it to `f1_score` would shadow the imported function and make the next call to
# it fail with "'float' object is not callable".
weighted_f1 = f1_score(y_true, predictions_binary, average='weighted')
weighted_f1

0.2892981491166879

#Model with augmentations:
Top 1: 0.5306691449814126

Top 5: 0.8186887461980399

F1: 0.32247597425305213

#Model without augmentations:
Top 1: 0.5143207164582629

Top 5: 0.8060155457924975

F1: 0.2892981491166879

#Check that a GPU is available and warm up the CPU and GPU

In [None]:
import tensorflow as tf
# Fail fast when the runtime does not expose the expected first GPU device.
device_name = tf.test.gpu_device_name()
gpu_found = device_name == '/device:GPU:0'
if not gpu_found:
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')


def _conv_sum_on(device):
  """Run one 7x7, 32-filter convolution over a random image batch on `device` and sum the result."""
  with tf.device(device):
    random_images = tf.random.normal((100, 100, 100, 3))
    conv_output = tf.keras.layers.Conv2D(32, 7)(random_images)
    return tf.math.reduce_sum(conv_output)


def cpu():
  """Run the sample convolution on the CPU."""
  return _conv_sum_on('/cpu:0')


def gpu():
  """Run the sample convolution on the first GPU."""
  return _conv_sum_on('/device:GPU:0')


# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()


<tf.Tensor: shape=(), dtype=float32, numpy=3984.2676>