In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
import numpy as np

# DNN model 1 (Densenet)

Below is our own implementation of densenet, following the tutorial in
https://amaarora.github.io/2020/08/02/densenets.html

In [4]:
def dense_net(initial_feature, num_label, input_shape, 
              dense_block_config, drop_out = 0.2, bottle_necksz=4, growth_rate=32):
    
    def dense_block(input_layer, num_sets, bottle_necksz, growth_rate):
        layer_sets = [input_layer]
        for i in range(num_sets):
            if i > 0:
                input_layer = keras.layers.Concatenate()(layer_sets)
                layer_sets = []
                layer_sets.append(input_layer)
            bottleneck_1 = keras.layers.BatchNormalization()(input_layer)
            activation_1 = keras.layers.ReLU()(bottleneck_1)
            convolution_1 = keras.layers.Conv2D(bottle_necksz*growth_rate,
                                                kernel_size=(1,1), strides=1, use_bias=False)(activation_1)
            bottleneck_2 = keras.layers.BatchNormalization()(convolution_1)
            activation_2 =  keras.layers.ReLU()(bottleneck_2)
            convolution_2 = keras.layers.Conv2D(growth_rate, kernel_size=(3,3), 
                                                strides=1, padding='same', use_bias=False)(activation_2)
            layer_sets.append(convolution_2)
        return keras.layers.Concatenate()(layer_sets)

    def transition_layer(input_layer):
        batch_norm = keras.layers.BatchNormalization()(input_layer)
        activation = keras.layers.ReLU()(batch_norm)
        feature_size = keras.backend.int_shape(activation)[3]
        conv = keras.layers.Conv2D(feature_size//2, kernel_size=(1,1),strides=1,use_bias=False)(activation)
        pool = keras.layers.AveragePooling2D()(conv)
        return pool

    def fully_connected_layer(input_layer, num_labels):
        pool = keras.layers.GlobalAveragePooling2D()(input_layer)
        norm_1 = keras.layers.BatchNormalization()(pool)
        dropout = keras.layers.Dropout(.2)(norm_1)
        dense_1 = keras.layers.Dense(1024, activation='relu')(dropout)
        dense_2 = keras.layers.Dense(512, activation='relu')(dense_1)
        norm_2 = keras.layers.BatchNormalization()(dense_2)
        dropout_2 = keras.layers.Dropout(.2)(norm_2)
        return keras.layers.Dense(num_labels, activation='softmax')(dropout_2)

    inputs = keras.Input(shape = input_shape)
    # initial transition layers
    initial_padding_1 = keras.layers.ZeroPadding2D(padding=(3,3))(inputs)
    initial_conv = keras.layers.Conv2D(initial_feature, kernel_size=(7,7), 
                                       strides=2, use_bias=False)(initial_padding_1)
    initial_norm = keras.layers.BatchNormalization()(initial_conv)
    initial_relu = keras.layers.ReLU()(initial_norm)
    initial_padding_2 = keras.layers.ZeroPadding2D(padding=(1,1))(initial_relu)
    initial = keras.layers.MaxPooling2D(pool_size=(3,3), strides=2)(initial_padding_2)
    
    for num in dense_block_config:
        conv = dense_block(initial, num, bottle_necksz, growth_rate)
        initial = transition_layer(conv)

    outputs = fully_connected_layer(initial, num_label)
    return keras.Model(inputs=inputs, outputs=outputs)

## DNN model 2 (Convolutional LSTM)
Below is the Convolutional LSTM used in https://github.com/WWH98932/Audio-Classification-Models

ResNet50 is discussed in the project report. The original paper for resnet is at https://arxiv.org/pdf/1512.03385.pdf

In [65]:
from tensorflow import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.layers.wrappers import TimeDistributed
from keras import regularizers

def resnet_ldnn(num_label):
    model = Sequential()
    model.add(keras.applications.resnet50.ResNet50(include_top=False, input_shape=(128, 126, 1), 
                                                   weights=None, classes=None, pooling='average'))
    model.add(Permute((2, 1, 3)))
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(64, dropout=0.3, return_sequences=True))
    model.add(LSTM(64, dropout=0.3))
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(num_label, kernel_regularizer=regularizers.l2(0.03), activation='sigmoid'))
    return model

# Data Preperation

In [37]:
import pandas as pd
import pathlib
import ast

# Read data from path
data_root = pathlib.Path('sample_set\\')
manifest = pd.read_csv("sample_set\\01_manifest.csv")
manifest['tag_set'] = manifest['tag_set'].apply(ast.literal_eval)
manifest['category_set'] = manifest['category_set'].apply(ast.literal_eval)
manifest

Unnamed: 0,filename,package_hash,manual_tag,tag_set,category_set
0,101729-0-0-1.wav,u8k_fold9,air_conditioner,{air_conditioner},{mechanical}
1,103249-5-0-1.wav,u8k_fold9,engine_idling,{engine_idling},{transport_car}
2,104817-4-0-11.wav,u8k_fold2,drilling,{drilling},{mechanical_construction}
3,104998-7-16-0.wav,u8k_fold5,jackhammer,{jackhammer},{mechanical_construction}
4,104998-7-18-3.wav,u8k_fold5,jackhammer,{jackhammer},{mechanical_construction}
...,...,...,...,...,...
4959,75490-8-1-0.wav,u8k_fold6,siren,{siren},{signals_siren}
4960,76085-4-0-61.wav,u8k_fold8,drilling,{drilling},{mechanical_construction}
4961,81787-2-0-23.wav,u8k_fold8,children_playing,{children_playing},{human_voice}
4962,87275-1-3-0.wav,u8k_fold1,car_horn,{car_horn},{signals_horn}


In [52]:
from sklearn.preprocessing import MultiLabelBinarizer
final_categories = ['animal_dogs', 'animal_birds','animal_insects', 'mechanical', 'transport_car']
category_encoder = MultiLabelBinarizer().fit([final_categories])
category_encoder.classes_

array(['animal_birds', 'animal_dogs', 'animal_insects', 'mechanical',
       'transport_car'], dtype=object)

In [53]:
# These spectrogram settings look pretty good from a domain perspective.
# Trying a little bit higher "resolution" than previously
mel_settings = {'fmax': 8000, 'power': 2, 'n_mels' :128, 'n_fft':2048, 'hop_length':512}
fs_nom = 16000 # Nominal sampling rate. Most files should be this rate, but if not, they will be resampled
shape_nom = (128,126) # nominal spectrogram shape

In [54]:
import os
import numpy as np
import soundfile as sf
import librosa
import librosa.display
import sklearn

def force_array_shape(x, force_shape):
    """Forces a numpy array to a specific shape by filling with zeros, or truncating"""
    pad_widths = []
    for ax, ax_length in enumerate(force_shape):
        if x.shape[ax] >= ax_length:
            x = x.take(indices=range(0,ax_length), axis=ax)
        pad_widths.append((0,ax_length-x.shape[ax]))
    x = np.pad(x, pad_widths)
    return x

def get_mels(filepath='', data=[], fs=None, force_shape=None):
    if filepath:
        data, fs = librosa.load(filepath, sr=fs)
        if fs != fs_nom:
            print(filepath)
    else:
        assert (len(data>0) and fs >0), 'Must provide either a filename, or array of data and sample rate'
    
    S = librosa.feature.melspectrogram(y=data, sr = fs, **mel_settings)
    
    if force_shape and S.shape != force_shape:
        
        S = force_array_shape(S, force_shape)
            
    return S, fs
 
def load_mels(filepath, force_create=False, save=True):
    mel_path = filepath.with_suffix('.npy')
    
    if mel_path.is_file() and not force_create:
        #print('Loading {}'.format(mel_path))
        mels = np.load(mel_path)
    else:
        #print('Generating from {}'.format(filepath))
        mels, _ = get_mels(filepath, fs=fs_nom, force_shape = shape_nom)
        if save:
            #print('Saving {}'.format(mel_path))
            np.save(mel_path, mels)
    
    return mels

def feature_preprocessing(mel):
    # convert to db and normalise
    power = librosa.core.power_to_db(mel, ref=np.max)
    power = power - np.mean(power)
    power = power / (np.std(power))
    return power[:, :, None]


In [55]:
# generate the features
# note this will store all features in memory, as well as saving them to disk. 
# Can't guarantee it will work for large datasets.
manifest['features'] = manifest.apply(lambda x: data_root/x['package_hash']/x['filename'], axis=1).apply(lambda x: feature_preprocessing(load_mels(x, force_create=True, save=True)))

In [56]:
from sklearn.model_selection import train_test_split

X = np.stack(manifest['features'].values)
y = category_encoder.transform(manifest['category_set'].values)

print('Category Support')
for c,n in zip(category_encoder.classes_, y.sum(axis=0)):
    print('{:30s}{} : {}'.format(c, category_encoder.transform([[c]]), n) )

idx_list= list(range(y.shape[0]))
for i in range(y.shape[0]):
    if np.all((y[i] == 0)):
        idx_list.remove(i) 
X = X[idx_list]
y = y[idx_list]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)
print(X.shape)



Category Support
animal_birds                  [[1 0 0 0 0]] : 2382
animal_dogs                   [[0 1 0 0 0]] : 2229
animal_insects                [[0 0 1 0 0]] : 677
mechanical                    [[0 0 0 1 0]] : 2099
transport_car                 [[0 0 0 0 1]] : 1251
(4652, 128, 126, 1)


## Training

In [57]:
# channel of the first convolutional layer
initial_feature = 64  
# number of labels to be categorized
num_labels = 6
input_shape = (128, 126, 1)
dense_block_config=(6, 12, 48, 32)
model = dense_net(initial_feature, num_labels, input_shape, dense_block_config)

NameError: name 'dense_net' is not defined

In [66]:
model = resnet_ldnn(5)

In [63]:
def custom_metric(y_true, y_pred):
    predictions = tf.cast(tf.greater_equal(y_pred, 0.5), tf.float32)
    pred_match = tf.equal(predictions, tf.round(y_true))
    exact_count = tf.math.reduce_min(tf.cast(pred_match, tf.float32), axis=1)
    return exact_count

In [67]:
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-5), metrics=[custom_metric])

# use the following for validation and training
history = model.fit(X_train, y_train, epochs=500,validation_data=(X_val, y_val))

# The following line is used for manual stopping of the network training
#history = model.fit(X_train, y_train, epochs=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500


Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500

KeyboardInterrupt: 

## Testing Stage


In [28]:
y_pred=model.predict(X_test)

In [29]:
# Convert probabilities to class labels (one-hot encoding)
y_pred[y_pred>=0.5] = 1
y_pred[y_pred<0.5] = 0

In [30]:
# exact match rate 
# code used from https://medium.com/analytics-vidhya/metrics-for-multi-label-classification-49cc5aeba1c3
# This function determines the rate of exact 
def emr(y_true, y_pred):
    n = len(y_true)
    row_indicators = np.all(y_true == y_pred, axis = 1) 
    exact_match_count = np.sum(row_indicators)
    return exact_match_count/n

emr(y_test, y_pred)

0.3239795918367347

In [31]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=final_categories))

              precision    recall  f1-score   support

 animal_dogs       0.64      0.77      0.70       230
animal_birds       0.74      0.73      0.74       229
  background       0.43      0.30      0.36       105

   micro avg       0.65      0.67      0.66       564
   macro avg       0.60      0.60      0.60       564
weighted avg       0.64      0.67      0.65       564
 samples avg       0.68      0.69      0.65       564



  _warn_prf(average, modifier, msg_start, len(result))
