In [1]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
import numpy as np

# DNN model 1 (Densenet)

Below is our own implementation of densenet, following the tutorial in
https://amaarora.github.io/2020/08/02/densenets.html

In [60]:
def dense_net(initial_feature, num_label, input_shape, 
              dense_block_config, drop_out = 0.2, bottle_necksz=4, growth_rate=32):
    
    def dense_block(input_layer, num_sets, bottle_necksz, growth_rate):
        layer_sets = [input_layer]
        for i in range(num_sets):
            if i > 0:
                input_layer = keras.layers.Concatenate()(layer_sets)
                layer_sets = []
                layer_sets.append(input_layer)
            bottleneck_1 = keras.layers.BatchNormalization()(input_layer)
            activation_1 = keras.layers.ReLU()(bottleneck_1)
            convolution_1 = keras.layers.Conv2D(bottle_necksz*growth_rate,
                                                kernel_size=(1,1), strides=1, use_bias=False)(activation_1)
            bottleneck_2 = keras.layers.BatchNormalization()(convolution_1)
            activation_2 =  keras.layers.ReLU()(bottleneck_2)
            convolution_2 = keras.layers.Conv2D(growth_rate, kernel_size=(3,3), 
                                                strides=1, padding='same', use_bias=False)(activation_2)
            layer_sets.append(convolution_2)
        return keras.layers.Concatenate()(layer_sets)

    def transition_layer(input_layer):
        batch_norm = keras.layers.BatchNormalization()(input_layer)
        activation = keras.layers.ReLU()(batch_norm)
        feature_size = keras.backend.int_shape(activation)[3]
        conv = keras.layers.Conv2D(feature_size//2, kernel_size=(1,1),strides=1,use_bias=False)(activation)
        pool = keras.layers.AveragePooling2D()(conv)
        return pool

    def fully_connected_layer(input_layer, num_labels):
        pool = keras.layers.GlobalAveragePooling2D()(input_layer)
        norm_1 = keras.layers.BatchNormalization()(pool)
        dropout = keras.layers.Dropout(.2)(norm_1)
        dense_1 = keras.layers.Dense(1024, activation='relu')(dropout)
        dense_2 = keras.layers.Dense(512, activation='relu')(dense_1)
        norm_2 = keras.layers.BatchNormalization()(dense_2)
        dropout_2 = keras.layers.Dropout(.2)(norm_2)
        return keras.layers.Dense(num_labels, activation='softmax')(dropout_2)

    inputs = keras.Input(shape = input_shape)
    # initial transition layers
    initial_padding_1 = keras.layers.ZeroPadding2D(padding=(3,3))(inputs)
    initial_conv = keras.layers.Conv2D(initial_feature, kernel_size=(7,7), 
                                       strides=2, use_bias=False)(initial_padding_1)
    initial_norm = keras.layers.BatchNormalization()(initial_conv)
    initial_relu = keras.layers.ReLU()(initial_norm)
    initial_padding_2 = keras.layers.ZeroPadding2D(padding=(1,1))(initial_relu)
    initial = keras.layers.MaxPooling2D(pool_size=(3,3), strides=2)(initial_padding_2)
    
    for num in dense_block_config:
        conv = dense_block(initial, num, bottle_necksz, growth_rate)
        initial = transition_layer(conv)

    outputs = fully_connected_layer(initial, num_label)
    return keras.Model(inputs=inputs, outputs=outputs)

## DNN model 2 (Convolutional LSTM)
Below is the Convolutional LSTM used in https://github.com/WWH98932/Audio-Classification-Models

ResNet50 is discussed in the project report. The original paper for resnet is at https://arxiv.org/pdf/1512.03385.pdf

In [36]:
from tensorflow import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.layers.wrappers import TimeDistributed
from keras import regularizers

def resnet_ldnn(num_label):
    model = Sequential()
    model.add(keras.applications.resnet50.ResNet50(include_top=False, input_shape=(128, 126, 1), 
                                                   weights=None, classes=None, pooling='average'))
    model.add(Permute((2, 1, 3)))
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(64, dropout=0.25, return_sequences=True))
    model.add(LSTM(64, dropout=0.25))
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(num_label, kernel_regularizer=regularizers.l2(0.01), activation='sigmoid'))
    return model

# Data Preperation

In [3]:
import pandas as pd
import pathlib
import ast

# Read data from path
data_root = pathlib.Path('sample_set\\')
manifest = pd.read_csv("sample_set\\01_manifest.csv")
manifest['tag_set'] = manifest['tag_set'].apply(ast.literal_eval)
manifest['category_set'] = manifest['category_set'].apply(ast.literal_eval)

In [49]:
from sklearn.preprocessing import MultiLabelBinarizer
final_categories = ['animal_dogs', 'animal_birds', 'human_voice', 'transport_car','background','mechanical']
category_encoder = MultiLabelBinarizer().fit([final_categories])
category_encoder.classes_

array(['animal_birds', 'animal_dogs', 'background', 'human_voice',
       'mechanical', 'transport_car'], dtype=object)

In [50]:
# These spectrogram settings look pretty good from a domain perspective.
# Trying a little bit higher "resolution" than previously
mel_settings = {'fmax': 8000, 'power': 2, 'n_mels' :128, 'n_fft':2048, 'hop_length':512}
fs_nom = 16000 # Nominal sampling rate. Most files should be this rate, but if not, they will be resampled
shape_nom = (128,126) # nominal spectrogram shape

In [46]:
import os
import numpy as np
import soundfile as sf
import librosa
import librosa.display
import sklearn

def force_array_shape(x, force_shape):
    """Forces a numpy array to a specific shape by filling with zeros, or truncating"""
    pad_widths = []
    for ax, ax_length in enumerate(force_shape):
        if x.shape[ax] >= ax_length:
            x = x.take(indices=range(0,ax_length), axis=ax)
        pad_widths.append((0,ax_length-x.shape[ax]))
    x = np.pad(x, pad_widths)
    return x

def get_mels(filepath='', data=[], fs=None, force_shape=None):
    if filepath:
        data, fs = librosa.load(filepath, sr=fs)
        if fs != fs_nom:
            print(filepath)
    else:
        assert (len(data>0) and fs >0), 'Must provide either a filename, or array of data and sample rate'
    
    S = librosa.feature.melspectrogram(y=data, sr = fs, **mel_settings)
    
    if force_shape and S.shape != force_shape:
        
        S = force_array_shape(S, force_shape)
            
    return S, fs
 
def load_mels(filepath, force_create=False, save=True):
    mel_path = filepath.with_suffix('.npy')
    
    if mel_path.is_file() and not force_create:
        #print('Loading {}'.format(mel_path))
        mels = np.load(mel_path)
    else:
        #print('Generating from {}'.format(filepath))
        mels, _ = get_mels(filepath, fs=fs_nom, force_shape = shape_nom)
        if save:
            #print('Saving {}'.format(mel_path))
            np.save(mel_path, mels)
    
    return mels

def feature_preprocessing(mel):
    # convert to db and normalise
    power = librosa.core.power_to_db(mel, ref=np.max)
    power = power - np.mean(power)
    power = power / (np.std(power))
    return power[:, :, None]


In [47]:
# generate the features
# note this will store all features in memory, as well as saving them to disk. 
# Can't guarantee it will work for large datasets.
manifest['features'] = manifest.apply(lambda x: data_root/x['package_hash']/x['filename'], axis=1).apply(lambda x: feature_preprocessing(load_mels(x, force_create=True, save=True)))

In [55]:
from sklearn.model_selection import train_test_split

X = np.stack(manifest['features'].values)
y = category_encoder.transform(manifest['category_set'].values)

print('Category Support')
for c,n in zip(category_encoder.classes_, y.sum(axis=0)):
    print('{:30s}{} : {}'.format(c, category_encoder.transform([[c]]), n) )

idx_list= list(range(y.shape[0]))
for i in range(y.shape[0]):
    if np.all((y[i] == 0)):
        idx_list.remove(i) 
X = X[idx_list]
y = y[idx_list]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)
print(X_train.shape)

Category Support
animal_birds                  [[1 0 0 0 0 0]] : 2382
animal_dogs                   [[0 1 0 0 0 0]] : 2229
background                    [[0 0 1 0 0 0]] : 915
human_voice                   [[0 0 0 1 0 0]] : 964
mechanical                    [[0 0 0 0 1 0]] : 2099
transport_car                 [[0 0 0 0 0 1]] : 1251
(4379, 128, 126, 1)


## Training

In [63]:
# channel of the first convolutional layer
initial_feature = 64  
# number of labels to be categorized
num_labels = 6
input_shape = (128, 126, 1)
dense_block_config=(6, 12, 24, 16)
model = dense_net(initial_feature, num_labels, input_shape, dense_block_config)

In [71]:
model = resnet_ldnn(6)

In [77]:
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-5))

# use the following for validation and training
# history = model.fit(X_train, y_train, epochs=500, validation_data=(X_val, y_val))

# The following line is used for manual stopping of the network training
history = model.fit(X_train, y_train, epochs=1)



## Testing Stage


In [78]:
y_pred=model.predict(X_test)

In [79]:
# Convert probabilities to class labels (one-hot encoding)
y_pred[y_pred>=0.5] = 1
y_pred[y_pred<0.5] = 0

In [80]:
# exact match rate 
# code used from https://medium.com/analytics-vidhya/metrics-for-multi-label-classification-49cc5aeba1c3
# This function determines the rate of exact 
def emr(y_true, y_pred):
    n = len(y_true)
    row_indicators = np.all(y_true == y_pred, axis = 1) 
    exact_match_count = np.sum(row_indicators)
    return exact_match_count/n

emr(y_test, y_pred)

0.8747433264887063

In [81]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=final_categories))

               precision    recall  f1-score   support

  animal_dogs       0.93      0.95      0.94       250
 animal_birds       0.97      0.96      0.96       226
  human_voice       0.96      0.91      0.93       100
transport_car       0.99      0.84      0.91        99
   background       0.93      0.96      0.95       210
   mechanical       0.95      0.85      0.90       124

    micro avg       0.95      0.93      0.94      1009
    macro avg       0.95      0.91      0.93      1009
 weighted avg       0.95      0.93      0.94      1009
  samples avg       0.94      0.93      0.93      1009

