# Preparing the dataset

In [1]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Saving the Spectrogram of a single audio file
def save_spectrogram(curr_audio_path, curr_audio_name):
    """Render the spectrogram of one audio file and save it under 'spectrograms/'.

    Parameters
    ----------
    curr_audio_path : str
        Path of the audio file; it is read with ``librosa.load``.
    curr_audio_name : str
        Name of the output image, relative to 'spectrograms/'. The image
        format is left to matplotlib, which falls back to its configured
        default (PNG unless ``savefig.format`` was changed) when the name
        carries no extension.

    Notes
    -----
    Every call draws on its own figure and closes it afterwards. Drawing on
    pyplot's implicit "current" figure would stack successive spectrograms on
    the same axes and keep every figure alive in memory.
    """
    # librosa.load() returns the samples as an np array plus the sampling rate
    # (22050 Hz by default).
    samples, sr = librosa.load(curr_audio_path)

    out_path = 'spectrograms/' + curr_audio_name
    # savefig() does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    fig, ax = plt.subplots()
    try:
        # Use the rate librosa actually returned instead of a hard-coded 22050.
        ax.specgram(samples, Fs=sr)
        # Hide both axes so that only the spectrogram itself ends up in the image.
        ax.get_yaxis().set_visible(False)
        ax.get_xaxis().set_visible(False)
        fig.savefig(out_path, bbox_inches='tight', pad_inches=0, dpi=25)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

In [None]:
from keras.preprocessing import image
from IPython.display import display
from PIL import Image

rootdir = 'spectrograms/'

X = []   # one (64, 64, 3) image array per spectrogram
Y = []   # integer class ID parsed from the matching file name
skipped_files = []   # files whose name carries no parsable class ID

for subdir, dirs, files in os.walk(rootdir):
    # Sorting makes the traversal order (and therefore the later seeded split)
    # independent of the order in which the filesystem lists its entries.
    dirs.sort()
    for file in sorted(files):
        curr_img_path = os.path.join(subdir, file)  # The path of current image file
        curr_img_path = os.path.normpath(curr_img_path)  # Normalise separators for the current OS
        curr_img_name = os.path.splitext(file)[0]   # The name of current image file (without extension)
        # The class ID is the second dash-separated field of the file name.
        # Files that do not follow this pattern (for instance the
        # 'testAudio0.png' image written into this folder by the prediction
        # cell) are skipped instead of aborting the whole load.
        try:
            cls = int(curr_img_name.split('-')[1])
        except (IndexError, ValueError):
            skipped_files.append(curr_img_path)
            continue
        img = image.load_img(curr_img_path, target_size = (64, 64))  # Load the actual image file
        img = image.img_to_array(img)        # Convert the loaded image file to the array
        X.append(img)
        Y.append(cls)

if skipped_files:
    print('Skipped', len(skipped_files), 'file(s) without a class ID in their name')
if not X:
    # Fail with an explicit message rather than an IndexError on X[0] below.
    raise FileNotFoundError('No labelled spectrogram images found under ' + rootdir)
print(len(X), len(Y))
print(X[0].shape)

In [None]:
from keras.utils import to_categorical
# Stack the per-image arrays into one tensor and one-hot encode the integer labels.
X = np.asarray(X)
Y = to_categorical(np.asarray(Y))
# Quick sanity check of shapes/types plus the first encoded label.
X.shape, type(X), Y.shape, type(Y), Y[0]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for testing; the fixed seed keeps the shuffle repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
)
# Report the shapes of the training split, then of the test split.
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

# Training the CNN

In [6]:
num_classes = 10       # number of units in the softmax output layer
input_width = 64       # image width in pixels
input_height = 64      # image height in pixels
input_channels = 3     # colour channels per pixel
# Keras' default `channels_last` layout orders image tensors as
# (height, width, channels). Width and height are equal here, so the value is
# unchanged, but the order stays correct if the two ever differ.
input_shape = (input_height, input_width, input_channels)

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras import optimizers

In [8]:
# Create the (initially empty) Sequential model object.
model = Sequential()

In [9]:
# Build the whole network in a single assignment. Re-running this cell then
# replaces `model` with a fresh network, whereas a chain of model.add() calls
# would append a second copy of every layer to the already-populated model.
model = Sequential([
    # Convolutional feature extractor
    Conv2D(32, (3, 3),
           activation='relu', padding='same',
           input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # Fully connected classifier head
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),   # one probability per class
])

In [10]:
# Use the canonical `RMSprop` class: the lower-case `rmsprop` alias is only
# available in old Keras releases. The learning rate (0.0005) is passed
# positionally because its keyword was renamed from `lr` to `learning_rate`
# between Keras versions, while it has remained the first parameter.
model.compile(optimizer=optimizers.RMSprop(0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train for 50 epochs. Note that the test split is also passed as the
# validation data, so the per-epoch validation metrics are computed on the
# same samples that are used for the final evaluation.
model.fit(X_train, Y_train, epochs=50, validation_data=(X_test, Y_test))

In [12]:
# Evaluate on the test split. With the loss and the single 'accuracy' metric
# configured at compile time, `score` holds the loss followed by the accuracy.
score = model.evaluate(X_test, Y_test)
score



[2.8971323836491147, 0.8256029486656189]

In [None]:
# Save the model, so that we can use this trained model later also.
# Saving may fail when the target folder is missing, so create it first
# (a no-op when it already exists).
os.makedirs('saved_models', exist_ok=True)
model.save('saved_models/UrbanSoundComplete.h5')

In [14]:
# Human-readable label for every class; the list position is the class index
# used to look a label up from a prediction.
class_names = [
    'air_conditioner',
    'car_horn',
    'children_playing',
    'dog_bark',
    'drilling',
    'engine_idling',
    'gun_shot',
    'jackhammer',
    'siren',
    'street_music',
]

In [None]:
# Testing on an audio file
wav_path = 'testAudio0.wav'
# Keep only the file stem: if wav_path contains folders, they must not leak
# into the image name that is appended to 'spectrograms/'.
wav_name = os.path.splitext(os.path.basename(wav_path))[0]
save_spectrogram(wav_path, wav_name)
# save_spectrogram() is given no extension, so this relies on matplotlib's
# default image format being PNG.
png_path = 'spectrograms/' + wav_name + '.png'
# Same preprocessing as the training images: resize to 64x64 and convert to an array.
png_img = image.load_img(png_path, target_size = (64, 64))
x = image.img_to_array(png_img)
x = np.expand_dims(x, axis = 0)   # add the leading batch axis expected by predict()
pred = model.predict(x)
class_idx = np.argmax(pred[0])    # index of the most probable class
predicted_class = class_names[class_idx]
print(predicted_class)