# Preparing the dataset

In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Saving the Spectrogram of a single audio file
def save_spectrogram(curr_audio_path, curr_audio_name):
    """Render the spectrogram of one audio file and save it as an image.

    Parameters
    ----------
    curr_audio_path : str
        Path of the audio file, read with ``librosa.load``.
    curr_audio_name : str
        File name, relative to ``spectrograms/``, under which the image
        is written.
    """
    # librosa.load() returns the samples as an np array plus the sampling
    # rate (22050 by default).
    X, sr = librosa.load(curr_audio_path)

    out_path = 'spectrograms/' + curr_audio_name
    # savefig() does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Draw on a fresh figure and always close it afterwards: reusing the
    # implicit pyplot figure would stack every new spectrogram on top of the
    # previous ones and keep all of them alive in memory.
    fig, ax = plt.subplots()
    try:
        # Use the sampling rate actually returned by librosa instead of a
        # hard-coded value, so the frequency axis stays correct.
        ax.specgram(X, Fs=sr)
        ax.get_yaxis().set_visible(False)
        ax.get_xaxis().set_visible(False)
        fig.savefig(out_path, bbox_inches='tight', pad_inches=0, dpi=25)
    finally:
        plt.close(fig)

In [3]:
from keras.preprocessing import image
from PIL import Image

rootdir = 'spectrograms/'
# Only files with these extensions are treated as spectrogram images; anything
# else found in the tree (hidden OS files, notes, ...) is skipped instead of
# crashing the image loader or the label parsing below.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

X = []  # one (64, 64, 3) float array per image
Y = []  # integer class id per image, parsed from the file name

for subdir, dirs, files in os.walk(rootdir):
    # Sort in place so os.walk descends in a fixed order; together with
    # sorted(files) this makes the sample order (and therefore any seeded
    # split made from it) independent of the filesystem.
    dirs.sort()
    for file in sorted(files):
        # The name of current image file (without the extension)
        curr_img_name, curr_img_ext = os.path.splitext(file)
        if curr_img_ext.lower() not in IMAGE_EXTENSIONS:
            continue
        curr_img_path = os.path.join(subdir, file)  # The path of current image file
        curr_img_path = os.path.normpath(curr_img_path)  # Normalise separators for the current OS
        img = image.load_img(curr_img_path, target_size=(64, 64))  # Load the image, resized to 64x64
        img = image.img_to_array(img)  # Convert the loaded image to an array
        # The class id is the second dash-separated field of the file name.
        cls = int(curr_img_name.split('-')[1])
        X.append(img)
        Y.append(cls)
print(len(X), len(Y))
print(X[0].shape)

Using TensorFlow backend.


8732 8732
(64, 64, 3)


In [4]:
from keras.utils import to_categorical
# Stack the per-image arrays / labels into single numpy arrays.
X = np.array(X)
Y = np.array(Y)
# One hot encoding. Guarded on ndim so that re-running this cell does not
# one-hot encode labels that have already been encoded.
if Y.ndim == 1:
    Y = to_categorical(Y)
X.shape, type(X), Y.shape, type(Y), (Y[0])

((8732, 64, 64, 3),
 numpy.ndarray,
 (8732, 10),
 numpy.ndarray,
 array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32))

In [5]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for testing; random_state pins the shuffle.
# NOTE(review): if several images come from the same original recording, a
# purely random split may place them on both sides of the split - confirm.
splits = train_test_split(X, Y, random_state=0)
X_train, X_test, Y_train, Y_test = splits
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(6549, 64, 64, 3) (6549, 10)
(2183, 64, 64, 3) (2183, 10)


# Training the CNN

In [6]:
# Number of target classes (size of the softmax output layer).
num_classes = 10
# Spatial size and channel count of the images fed to the network.
input_width = 64
input_height = 64
input_channels = 3
# Channels-last image tensors are laid out as (rows, cols, channels), i.e.
# (height, width, channels) - height comes first. Identical for the square
# 64x64 inputs used here, but stays correct if the two sizes ever differ.
input_shape = (input_height, input_width, input_channels)

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras import optimizers

In [8]:
# Five conv -> max-pool stages that differ only in their number of filters.
conv_filters = (32, 32, 64, 64, 128)

model = Sequential()
for stage, n_filters in enumerate(conv_filters):
    # Only the very first layer is given the input shape.
    first_layer_kwargs = {'input_shape': input_shape} if stage == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), padding='same', activation='relu',
                     **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head: dropout, flatten, one hidden dense layer, softmax output.
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(
    optimizers.Adam(lr=0.0005),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 64, 64, 32)        896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 32, 32)        9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 16, 16, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 16, 16, 64)        18496     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 8, 8, 64)          0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 8, 8, 64)         

In [None]:
# Train for 50 epochs; the held-out split is passed as validation data.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=50,
    validation_data=(X_test, Y_test),
)

In [19]:
# Evaluate on the held-out split; the model was compiled with the
# "accuracy" metric, so the result holds the loss followed by the accuracy.
score = model.evaluate(x=X_test, y=Y_test)
print(score)

[0.7213552639816695, 0.8405863642692566]


In [33]:
# Save the model, so that we can use this trained model later also.
# Writing the file can fail when the target folder is missing, so create it
# first (no-op when it already exists).
model_path = 'saved_models/UrbanSoundCompleteCNNAdam.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [24]:
# Per-class scores predicted by the network for both splits.
Y_test_pred = model.predict(X_test)
Y_train_pred = model.predict(X_train)

# Collapse predicted-score rows and one-hot label rows to integer class ids.
y_test_pred, y_test = Y_test_pred.argmax(axis=1), Y_test.argmax(axis=1)
y_train_pred, y_train = Y_train_pred.argmax(axis=1), Y_train.argmax(axis=1)

In [28]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [29]:
# Test split: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_test_pred))

[[227   0   0   4   2   3   0   3   3   2]
 [  0  77   6   6   3   2   1   0   1   8]
 [  6   1 188  14   5  10   0   1   2  32]
 [  3   2  16 177   2   4   2   1   5  12]
 [  0   2   2  11 209   3   4  17   0   8]
 [ 12   1   9   3   1 230   1   2   1   6]
 [  1   0   0   1   1   0 100   1   0   3]
 [  2   1   1   1   3   0   0 220   0   8]
 [  5   0   2   3   1   2   0   0 223  16]
 [  9   9  13   5   4   4   0   2   5 184]]


In [30]:
# Training split: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_train, y_pred=y_train_pred))

[[756   0   0   0   0   0   0   0   0   0]
 [  0 324   0   0   0   0   0   0   0   1]
 [  1   0 735   0   0   0   0   0   0   5]
 [  1   1   0 774   0   0   0   0   0   0]
 [  0   0   1   0 738   0   0   5   0   0]
 [  7   0   2   1   0 723   0   1   0   0]
 [  0   0   0   0   0   0 267   0   0   0]
 [  0   0   0   0   0   0   0 764   0   0]
 [  0   0   0   0   0   0   0   0 676   1]
 [  5   0   0   0   0   0   0   2   0 758]]


In [31]:
# Per-class precision / recall / f1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.86      0.93      0.89       244
           1       0.83      0.74      0.78       104
           2       0.79      0.73      0.76       259
           3       0.79      0.79      0.79       224
           4       0.90      0.82      0.86       256
           5       0.89      0.86      0.88       266
           6       0.93      0.93      0.93       107
           7       0.89      0.93      0.91       236
           8       0.93      0.88      0.91       252
           9       0.66      0.78      0.72       235

    accuracy                           0.84      2183
   macro avg       0.85      0.84      0.84      2183
weighted avg       0.84      0.84      0.84      2183



In [32]:
# Per-class precision / recall / f1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_train_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       756
           1       1.00      1.00      1.00       325
           2       1.00      0.99      0.99       741
           3       1.00      1.00      1.00       776
           4       1.00      0.99      1.00       744
           5       1.00      0.99      0.99       734
           6       1.00      1.00      1.00       267
           7       0.99      1.00      0.99       764
           8       1.00      1.00      1.00       677
           9       0.99      0.99      0.99       765

    accuracy                           0.99      6549
   macro avg       1.00      1.00      1.00      6549
weighted avg       0.99      0.99      0.99      6549

