### CNN Model for Audio Tagging

In [3]:
import numpy
import os
# import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# import seaborn as sns
from tensorflow.keras.models import load_model

In [4]:
# Initialize all path variables
dir_path = os.getcwd()
x_data = os.path.join(dir_path, "X")
y_data = os.path.join(dir_path, "Y")
mfcc_data = os.path.join(dir_path, "MFCC")

### Preprocessing Data

In [None]:
# PARAMETER FOR STFT
# SAMPLING_RATE=16000
# DURATION=10
# N_FFT = 1024
# WIN_LENGTH = 1024
# HOP_LENGTH = 512

In [None]:
for file in os.listdir(x_data):
    mel_spectrogram = numpy.load(x_data+'/'+file)
    print(file)
    mfcc = librosa.feature.mfcc(S=mel_spectrogram, sr=16000,n_mfcc=20)
    mean = numpy.average(mfcc)
    std = numpy.std(mfcc)
    if(std == 0):
      std = std + 1e-25
    mfcc = (mfcc - mean)/std
    path = os.path.join(mfcc_data, file)
    numpy.save(path, mfcc)

In [None]:
x = os.listdir(x_data)
x = sorted(set(x))

In [None]:
#List containing all spectrograms
mfccs=[] 
for file in x:
    arr = numpy.load(os.path.join(mfcc_data, file))
    m,n,o = arr.shape
    arr = arr.reshape(n,o)
    mat = arr.reshape((arr.shape[0], arr.shape[1], 1))
    mfccs.append(tf.convert_to_tensor(mat))
mfccs = numpy.array(mfccs)

In [None]:
def eventroll_to_multihot_vector(eventroll):
    """
    Parameters
    ----------
    eventroll : np.array
        Eventroll matrix of shape=(11, 1000).
    
    Returns
    -------
    np.array
        A multihot vector of shape=(10,)
    """
    
    # findout active events:
    active_events = (eventroll.sum(axis=1) >= 0.5).astype('float')
    
    # remove silence class:
    return numpy.delete(active_events, 8)

In [None]:
y_list = os.listdir(y_data)
y_list = sorted(set(y_list))

In [None]:
#List containing all spectrograms
y = [] 
for file in y_list:
    arr = numpy.load(os.path.join(y_data, file))
    y.append(eventroll_to_multihot_vector(arr))
y = numpy.array(y)

#### Load the required data

In [None]:
X=mfccs[0:10000]
y=y[0:10000]
# used 'random_state' of 40 while splitting to get the balanced split of data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40, shuffle="true")

In [None]:
# get the count of classes present in y_test
# print(type(y_test))
# val_label=pd.Series(list(y_test))
# val_label.value_counts()

In [None]:
pool_size = (2, 2)
kernel_size = (3, 3)
input_shape = (X_train.shape[1], X_train.shape[2], X_train.shape[3])
num_classes = 10

print(X_train.shape)

(8000, 64, 1000, 1)


In [None]:
'''10%'''
# model = tf.keras.models.Sequential([
#     #first_convolution
#     tf.keras.layers.Conv2D(32, kernel_size,
#                 padding="same", input_shape=input_shape),
#     tf.keras.layers.BatchNormalization(),
#     tf.keras.layers.Activation('relu'),
#     tf.keras.layers.MaxPooling2D(2, 2),
#     # tf.keras.layers.Dropout(0.25),
#     #second_convolution
#     tf.keras.layers.Conv2D(64, kernel_size,
#                                   padding="same"),
#     tf.keras.layers.BatchNormalization(),
#     tf.keras.layers.Activation('relu'),
#     tf.keras.layers.MaxPooling2D(2,2),
# #     tf.keras.layers.Dropout(0.25),
#     #third_convolution
#     tf.keras.layers.Conv2D(128, kernel_size,
#                                   padding="same"),
#     tf.keras.layers.BatchNormalization(),
#     tf.keras.layers.Activation('relu'),
#     tf.keras.layers.MaxPooling2D(2,2),
#     tf.keras.layers.Dropout(0.25),
#     #fourth_convolution
#     tf.keras.layers.Conv2D(128, kernel_size,
#                                   padding="same"),
#     tf.keras.layers.BatchNormalization(),
#     tf.keras.layers.Activation('relu'),
#     tf.keras.layers.GlobalMaxPooling2D(),
#     tf.keras.layers.Dropout(0.25),
#     #Fully connected 1st layer
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(256, activation="relu"),
#     tf.keras.layers.Dense(128, activation="relu"),
#     tf.keras.layers.Dense(10, activation="sigmoid") 
# ]) 

'10%'

https://publications.lib.chalmers.se/records/fulltext/255604/255604.pdf

In [None]:
model = tf.keras.models.Sequential([
    #first_convolution
    tf.keras.layers.Conv2D(32, kernel_size,
                padding="same", input_shape=input_shape),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    # tf.keras.layers.Dropout(0.25),
    #second_convolution
    tf.keras.layers.Conv2D(128, kernel_size,
                                  padding="same"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPooling2D(2,2),
#     tf.keras.layers.Dropout(0.25),
    #third_convolution
    tf.keras.layers.Conv2D(128, kernel_size,
                                  padding="same"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    # tf.keras.layers.Dropout(0.25),
    #fifth_convolution
    tf.keras.layers.Conv2D(256, kernel_size,
                                  padding="same"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.GlobalMaxPooling2D(),
    # tf.keras.layers.Dropout(0.25),
    #Fully connected 1st layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1024, activation="relu"),
    tf.keras.layers.Dense(512, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(10, activation="sigmoid") 
]) 

In [None]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_14 (Conv2D)          (None, 64, 1000, 32)      320       
                                                                 
 batch_normalization_14 (Bat  (None, 64, 1000, 32)     128       
 chNormalization)                                                
                                                                 
 activation_14 (Activation)  (None, 64, 1000, 32)      0         
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 32, 500, 32)      0         
 2D)                                                             
                                                                 
 conv2d_15 (Conv2D)          (None, 32, 500, 128)      36992     
                                                                 
 batch_normalization_15 (Bat  (None, 32, 500, 128)    

https://www.kaggle.com/code/kmkarakaya/multi-label-model-evaluation/notebook

In [None]:
# # compile the model using Adam optimizer
# model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
#           loss=tf.keras.losses.BinaryCrossentropy(),
#           metrics=["accuracy"])

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# To keep track of the best metrices obtained while training the model
from keras.callbacks import ModelCheckpoint

filepath = '/content/drive/MyDrive/Audio_Classification-MLSP_Assignment-2/my_best_model_cnn.hdf5'
checkpoint = ModelCheckpoint(filepath=filepath, monitor='accuracy', verbose=1, save_best_only=True, mode='max')

In [None]:
history=model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=10, verbose = 1, callbacks=[checkpoint])

Epoch 1/10
Epoch 1: accuracy improved from -inf to 0.11350, saving model to /content/drive/MyDrive/Audio_Classification-MLSP_Assignment-2/my_best_model_cnn.hdf5
Epoch 2/10
Epoch 2: accuracy improved from 0.11350 to 0.11425, saving model to /content/drive/MyDrive/Audio_Classification-MLSP_Assignment-2/my_best_model_cnn.hdf5
Epoch 3/10
Epoch 3: accuracy did not improve from 0.11425
Epoch 4/10
Epoch 4: accuracy did not improve from 0.11425
Epoch 5/10
Epoch 5: accuracy did not improve from 0.11425
Epoch 6/10
Epoch 6: accuracy did not improve from 0.11425
Epoch 7/10
Epoch 7: accuracy did not improve from 0.11425
Epoch 8/10
Epoch 8: accuracy did not improve from 0.11425
Epoch 9/10
Epoch 9: accuracy did not improve from 0.11425
Epoch 10/10
Epoch 10: accuracy did not improve from 0.11425


In [None]:
from matplotlib import pyplot as plt
model.metrics_names
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

accuracy = model.evaluate(X_test, y_test)
print('n', 'Test_Accuracy:-', accuracy[1])
pred=model.predict(X_test)
y_pred = pred
y_true = y_test

print('confusion matrix')
print(multilabel_confusion_matrix(numpy.array(y_true), numpy.array(y_pred)))

# f, ax = plt.subplots(figsize=(8,5))
# sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt=".0f", ax=ax)
# plt.xlabel("y_pred")
# plt.ylabel("y_true")
# plt.show()

In [None]:
print(y_pred[0])
print(y_true[0])

In [None]:
model = load_model(filepath, custom_objects={'f1_m':f1_m, 'precision_m':precision_m, 'recall_m':recall_m})