In [145]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
import numpy as np

## DNN model 

In [146]:
# Convolutional LSTM as introduced in the report
from tensorflow import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.layers.wrappers import TimeDistributed
from keras import regularizers

def resnet_ldnn(num_label):
    model = Sequential()
    model.add(keras.applications.resnet50.ResNet50(include_top=False, input_shape=(128, 126, 1), 
                                                   weights=None, classes=None, pooling='average'))
    model.add(Permute((2, 1, 3)))
    model.add(TimeDistributed(Flatten()))
    model.add(LSTM(64, dropout=0.25, return_sequences=True))
    model.add(LSTM(64, dropout=0.25))
    model.add(Dense(64))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.5))
    model.add(Dense(num_label, kernel_regularizer=regularizers.l2(0.01), activation='softmax'))
    return model

# Data Preperation

In [147]:
import pandas as pd
import pathlib
import ast

# Read data from path
data_root = pathlib.Path('classical_set\\')
# manifest will be a pandas dataframe that contains information of the audio data and its label
manifest = pd.read_csv("classical_set\\01_manifest.csv")
manifest['tag_set'] = manifest['tag_set'].apply(ast.literal_eval)
manifest['category_set'] = manifest['category_set'].apply(ast.literal_eval)

In [148]:
from sklearn.preprocessing import MultiLabelBinarizer
final_categories = ['animal_dogs', 'animal_birds', 'human_voice', 'transport_car','music','mechanical']
category_encoder = MultiLabelBinarizer().fit([final_categories])

In [149]:
manifest

Unnamed: 0,category,filename,package_hash,manual_tag,tag_set,category_set
0,animal_birds,2019-03-26 18_43_23.wav,sbfg,bi,{bi},{animal_birds}
1,animal_birds,2019-08-21 09_45_45.wav,slhg_1,bi,{bi},{animal_birds}
2,animal_birds,2019-05-18 12_18_54.wav,nbwo_1,bi,{bi},{animal_birds}
3,animal_birds,2019-09-18 12_31_41.wav,qhxt,bi,{bi},{animal_birds}
4,animal_birds,2019-08-23 11_57_58.wav,slhg_2,bi,{bi},{animal_birds}
...,...,...,...,...,...,...
295,transport_car,2019-10-10 19_22_47.wav,fudn,ca,{ca},{transport_car}
296,transport_car,2019-08-24 20_19_55.wav,rlzn,ca,{ca},{transport_car}
297,transport_car,2019-06-28 19_34_03.wav,tstr,ca,{ca},{transport_car}
298,transport_car,2021-06-12T02_02_33+0930.wav,flso,ca,{ca},{transport_car}


In [165]:
# storing audio features into array
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
import librosa
import numpy as np
# convert cateogry into intger labels
manifest.category = pd.Categorical(manifest.category)
manifest["label"] = manifest.category.cat.codes
# Total 300 recordings
num = 300
train_set = np.zeros((len(final_categories), 50, 126, 20))
for i in range(num):
    x = manifest.iloc[i]
    # load the auio path and read the audio with sampling frequency as 16000 Hz
    sig, fs = librosa.load(data_root/x['package_hash']/x['filename'], sr=16000)
    sig = sig / sig.max()
    # each category contains 50 recordings 
    # i//50 is the corresponding category of the audio
    # i%50 is the order of the recording in the 50 recrodings for the category
    train_set[i//50, i%50] = librosa.feature.mfcc(y=sig, sr=fs).T

## Classical ML method - HMM-GMM

In [235]:
# 45 for training, 5 for testing
num_train = 45
num_test = 50 - num_train
test_set = np.zeros((num_test*6, 126, 20))
test_set_label = np.zeros((num_test*6, 1))
for i in range(num_test*6):
    test_set_label[i] = i//num_test
hmm_classifier_lists = []
# for each category train a classifier using the 45 audio data in the category
for j in range(len(final_categories)):
    current_train_set = np.concatenate(train_set[j][:num_train])
    test_set[j*num_test :(j+1)*num_test] = train_set[j][num_train:]
    lengths = [126 for i in range(num_train)]
    hmm_classifier_lists.append(hmm.GaussianHMM(n_components=7,n_iter=100,covariance_type='full'))
    hmm_classifier_lists[-1].fit(current_train_set, lengths)

In [236]:
total_test_data= num_test*6
num_correct = 0
true_label = -1
# use maximum likelihood to determine the corresponding category
for i in range(num_test*6):
    if true_label != i//num_test:
        print()
        print(f"Now testing for category {final_categories[i//num_test]}")
    true_label = i//num_test
    current_max_score = np.iinfo(np.int32).min
    current_label = 0
    for j in range(len(hmm_classifier_lists)):
        # return log likelihood of each classifier
        score = hmm_classifier_lists[j].score(test_set[i])
        if score > current_max_score:
            current_max_score = score
            current_label = j
    if true_label == current_label:
        print(f"correct {final_categories[true_label]}")
        num_correct = num_correct + 1
    else:
        print(f"wrong {final_categories[true_label]} classified as {final_categories[current_label]} ")
print(f"The prediction accuracy on test set is {num_correct/total_test_data:.2f}")


Now testing for category animal_dogs
correct animal_dogs
correct animal_dogs
correct animal_dogs
wrong animal_dogs classified as animal_birds 
wrong animal_dogs classified as animal_birds 

Now testing for category animal_birds
correct animal_birds
wrong animal_birds classified as human_voice 
correct animal_birds
correct animal_birds
correct animal_birds

Now testing for category human_voice
correct human_voice
wrong human_voice classified as music 
correct human_voice
wrong human_voice classified as music 
correct human_voice

Now testing for category transport_car
wrong transport_car classified as human_voice 
correct transport_car
wrong transport_car classified as music 
wrong transport_car classified as human_voice 
wrong transport_car classified as animal_birds 

Now testing for category music
correct music
correct music
wrong music classified as human_voice 
correct music
correct music

Now testing for category mechanical
correct mechanical
correct mechanical
wrong mechanical c

In [168]:
# These spectrogram settings look pretty good from a domain perspective.
# Trying a little bit higher "resolution" than previously
mel_settings = {'fmax': 8000, 'power': 2, 'n_mels' :128, 'n_fft':2048, 'hop_length':512}
fs_nom = 16000 # Nominal sampling rate. Most files should be this rate, but if not, they will be resampled
shape_nom = (128,126) # nominal spectrogram shape

In [169]:
import os
import numpy as np
import soundfile as sf
import librosa
import librosa.display
import sklearn

def force_array_shape(x, force_shape):
    """Forces a numpy array to a specific shape by filling with zeros, or truncating"""
    pad_widths = []
    for ax, ax_length in enumerate(force_shape):
        if x.shape[ax] >= ax_length:
            x = x.take(indices=range(0,ax_length), axis=ax)
        pad_widths.append((0,ax_length-x.shape[ax]))
    x = np.pad(x, pad_widths)
    return x

def get_mels(filepath='', data=[], fs=None, force_shape=None):
    if filepath:
        data, fs = librosa.load(filepath, sr=fs)
        if fs != fs_nom:
            print(filepath)
    else:
        assert (len(data>0) and fs >0), 'Must provide either a filename, or array of data and sample rate'
    
    S = librosa.feature.melspectrogram(y=data, sr = fs, **mel_settings)
    
    if force_shape and S.shape != force_shape:
        
        S = force_array_shape(S, force_shape)
            
    return S, fs
 
def load_mels(filepath, force_create=False, save=True):
    mel_path = filepath.with_suffix('.npy')
    
    if mel_path.is_file() and not force_create:
        #print('Loading {}'.format(mel_path))
        mels = np.load(mel_path)
    else:
        #print('Generating from {}'.format(filepath))
        mels, _ = get_mels(filepath, fs=fs_nom, force_shape = shape_nom)
        if save:
            #print('Saving {}'.format(mel_path))
            np.save(mel_path, mels)
    
    return mels

def feature_preprocessing(mel):
    # convert to db and normalise
    power = librosa.core.power_to_db(mel, ref=np.max)
    power = power - np.mean(power)
    power = power / (np.std(power))
    return power[:, :, None]


In [170]:
# generate the features
# note this will store all features in memory, as well as saving them to disk. 
# Can't guarantee it will work for large datasets.
manifest['features'] = manifest.apply(lambda x: data_root/x['package_hash']/x['filename'], 
                                      axis=1).apply(lambda x: feature_preprocessing(load_mels(x, force_create=True, save=True)))

In [220]:
from sklearn.model_selection import train_test_split

X = np.stack(manifest['features'].values)
y = category_encoder.transform(manifest['category_set'].values)

print('Category Support')
for c,n in zip(category_encoder.classes_, y.sum(axis=0)):
    print('{:30s}{} : {}'.format(c, category_encoder.transform([[c]]), n) )

idx_list= list(range(y.shape[0]))
for i in range(y.shape[0]):
    if np.all((y[i] == 0)):
        idx_list.remove(i) 
X = X[idx_list]
y = y[idx_list]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=42)
print(X_train.shape)

Category Support
animal_birds                  [[1 0 0 0 0 0]] : 50
animal_dogs                   [[0 1 0 0 0 0]] : 50
human_voice                   [[0 0 1 0 0 0]] : 50
mechanical                    [[0 0 0 1 0 0]] : 50
music                         [[0 0 0 0 1 0]] : 50
transport_car                 [[0 0 0 0 0 1]] : 50
(270, 128, 126, 1)


## Training

In [214]:
model = resnet_ldnn(6)

In [215]:
def custom_metric(y_true, y_pred):
    predictions = tf.cast(tf.greater_equal(y_pred, 0.5), tf.float32)
    pred_match = tf.equal(predictions, tf.round(y_true))
    exact_count = tf.math.reduce_min(tf.cast(pred_match, tf.float32), axis=1)
    return exact_count

In [227]:
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-5), metrics=['accuracy'])

# use the following for validation and training
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Testing Stage


In [228]:
y_pred=model.predict(X_test)
y_pred=y_pred.argmax(axis=-1)
y_pred

array([3, 3, 0, 0, 4, 2, 3, 2, 5, 3, 3, 1, 2, 0, 3, 4, 5, 2, 2, 5, 1, 3,
       2, 4, 0, 3, 2, 5, 3, 2], dtype=int64)

In [229]:
y_test_true = np.where(y_test==1)[1]
y_test_true

array([4, 5, 3, 0, 4, 4, 3, 2, 0, 3, 4, 1, 4, 0, 3, 4, 5, 4, 2, 3, 1, 2,
       4, 5, 2, 0, 5, 5, 3, 4], dtype=int64)

In [232]:
count = 0
for i in range(len(y_pred)):
    if y_pred[i] == y_test_true[i]:
        count = count + 1
print(f"The prediction accuracy for the CNN is {count/len(y_pred):.2f}")

The prediction accuracy for the CNN is 0.47


In [231]:
from sklearn.metrics import classification_report

print(classification_report(y_test_true, y_pred, target_names=final_categories))

               precision    recall  f1-score   support

  animal_dogs       0.50      0.50      0.50         4
 animal_birds       1.00      1.00      1.00         2
  human_voice       0.25      0.50      0.33         4
transport_car       0.44      0.67      0.53         6
        music       0.67      0.22      0.33         9
   mechanical       0.50      0.40      0.44         5

     accuracy                           0.47        30
    macro avg       0.56      0.55      0.52        30
 weighted avg       0.54      0.47      0.46        30

