In [1]:
# Mount Google Drive inside the Colab VM at /content/drive; the dataset archive
# unzipped in the next cell is read from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip "drive/MyDrive/audio_dataset.zip"

In [11]:
# Remove the Jupyter checkpoint directory from the Laughter class folder.
# create_pngs_from_wavs() passes every os.listdir() entry to conv_m4a(), which
# opens it as a file, so a directory entry in a class folder would make it fail.
!rm -r "audio_dataset/train/Laughter/.ipynb_checkpoints"

In [2]:
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import os
!pip install pydub
from pydub import AudioSegment



In [3]:
def conv_m4a(input_file):
    """Convert an M4A-encoded audio file to WAV and return the path to use.

    The file type is detected from its content, not its extension: the first
    20 bytes are searched for the ``b'M4A '`` brand marker. Files without the
    marker are left untouched.

    Parameters
    ----------
    input_file : str
        Path of the audio file to inspect.

    Returns
    -------
    str
        ``input_file`` itself when no conversion was needed, otherwise the
        path of the exported WAV file (same location, ``.wav`` extension).

    Notes
    -----
    After a successful export the original file is deleted, unless the WAV
    was written to the very same path (an M4A file that already carried a
    ``.wav`` extension), in which case it has simply been overwritten.
    """
    # Read the header and release the handle before touching the file again.
    with open(input_file, 'rb') as file:
        header = file.read(20)

    if b'\x4D\x34\x41\x20' not in header:  # b'M4A '
        return input_file

    sound = AudioSegment.from_file(input_file, format='m4a')

    # splitext only strips the final extension, so dots in directory names or
    # elsewhere in the file name do not truncate the output path.
    out_file = os.path.splitext(input_file)[0] + '.wav'

    # Export first and delete afterwards so a failed export cannot lose data.
    exported = sound.export(out_file, format='wav')
    if exported is not None:
        exported.close()

    if os.path.abspath(out_file) != os.path.abspath(input_file):
        os.remove(input_file)

    return out_file

In [4]:
class conf:
    # Shared audio / spectrogram settings read by create_spectrogram().

    # Clip geometry: every clip is trimmed or zero-padded to `samples` values.
    duration = 2
    sampling_rate = 30000
    samples = duration * sampling_rate

    # Mel-spectrogram parameters handed to librosa.feature.melspectrogram.
    n_mels = 128
    n_fft = 20 * n_mels
    hop_length = duration * 347

    # Frequency band covered by the mel filters (upper bound is half the sampling rate).
    fmin = 20
    fmax = sampling_rate // 2

In [5]:
def create_spectrogram(audio_file, image_file):
    """Render the log-mel spectrogram of an audio file and save it as an image.

    The audio is loaded at ``conf.sampling_rate`` and forced to exactly
    ``conf.samples`` values: longer clips keep only their first
    ``conf.samples`` values, shorter clips are zero-padded on both sides so
    the signal sits in the middle.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to read with ``librosa.load``.
    image_file : str
        Destination path handed to ``Figure.savefig``.
    """
    # Do all the audio work before any figure exists, so a failure while
    # loading or transforming cannot leave an open figure behind.
    y, _ = librosa.load(audio_file, sr=conf.sampling_rate)

    if len(y) > conf.samples:
        y = y[:conf.samples]
    else:
        padding = conf.samples - len(y)
        offset = padding // 2
        # Left side gets `offset` zeros, right side gets the remainder.
        y = np.pad(y, (offset, padding - offset), 'constant')

    ms = librosa.feature.melspectrogram(y=y, sr=conf.sampling_rate, n_mels=conf.n_mels,
                                        hop_length=conf.hop_length, n_fft=conf.n_fft,
                                        fmin=conf.fmin, fmax=conf.fmax)
    log_ms = librosa.power_to_db(ms, ref=np.max)

    fig = plt.figure()
    try:
        ax = fig.add_subplot(1, 1, 1)
        # Let the single axes fill the whole canvas (no margins in the image).
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        # Draw on this figure's axes explicitly, not on the implicit current axes.
        librosa.display.specshow(log_ms, sr=conf.sampling_rate, ax=ax)
        fig.savefig(image_file)
    finally:
        # Always release the figure; this function is called once per audio file.
        plt.close(fig)

def create_pngs_from_wavs(input_path, output_path):
    """Create one spectrogram PNG for every audio file in a directory.

    Each regular file in ``input_path`` is first passed through
    ``conv_m4a`` (which may replace it with a WAV file) and then rendered
    with ``create_spectrogram``. The image is written to ``output_path``
    under the original directory-entry name with ``.png`` appended.

    Parameters
    ----------
    input_path : str
        Directory containing the audio files.
    output_path : str
        Directory that receives the PNG files; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)

    # Sorted so the processing order (and the printed log) is reproducible.
    for file in sorted(os.listdir(input_path)):
        input_file = os.path.join(input_path, file)

        # Skip sub-directories such as `.ipynb_checkpoints`; they cannot be
        # opened as audio files.
        if not os.path.isfile(input_file):
            continue

        input_file = conv_m4a(input_file)
        output_file = os.path.join(output_path, file + '.png')
        create_spectrogram(input_file, output_file)
        print(file)

In [None]:
# Class names; each one is also the name of its sub-directory in the dataset.
# The position of a name in this list is used later as its integer label.
sounds = ['Laughter', 'car_horn', 'dog_barking', 'drilling', 'Fart', 'Guitar', 'Gunshot_and_gunfire', 'Hi-hat', 'Knock',
          'Shatter', 'siren', 'Snare_drum', 'Splash_and_splatter']

# Render the training split: one spectrogram directory per class.
for sound in sounds:
    input_path = f'audio_dataset/train/{sound}'
    output_path = f'spectrograms/train/{sound}'
    create_pngs_from_wavs(input_path, output_path)
    print('Finished ', sound)

In [None]:
# Same class list as the training cell (repeated so this cell can run on its own).
sounds = ['Laughter', 'car_horn', 'dog_barking', 'drilling', 'Fart', 'Guitar', 'Gunshot_and_gunfire', 'Hi-hat', 'Knock',
          'Shatter', 'siren', 'Snare_drum', 'Splash_and_splatter']

# Render the validation split: one spectrogram directory per class.
for sound in sounds:
    input_path = f'audio_dataset/val/{sound}'
    output_path = f'spectrograms/val/{sound}'
    create_pngs_from_wavs(input_path, output_path)
    print('Finished ', sound)

In [None]:
from keras.preprocessing import image

def load_images_from_path(path, label):
    """Load every image file in a directory and pair it with one label.

    Parameters
    ----------
    path : str
        Directory containing the spectrogram images of a single class.
    label : int
        Label attached to every image loaded from ``path``.

    Returns
    -------
    tuple[list, list]
        ``(images, labels)``: the images resized to 224x224 and converted
        with ``image.img_to_array``, and a list repeating ``label`` once per
        image. Both lists are empty when the directory has no files.
    """
    images = []
    labels = []

    # Sorted so the sample order is reproducible across runs and machines.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)

        # Ignore sub-directories (e.g. `.ipynb_checkpoints`); they are not images.
        if not os.path.isfile(file_path):
            continue

        # target_size is (height, width); the channel count is not part of it.
        loaded = image.load_img(file_path, target_size=(224, 224))
        images.append(image.img_to_array(loaded))
        labels.append(label)

    return images, labels

def show_images(images):
    """Display up to the first eight images side by side in a single row.

    Parameters
    ----------
    images : sequence
        Image arrays holding values on a 0-255 scale; each is divided by 255
        before being passed to ``imshow``. When fewer than eight images are
        given, the remaining panels are left empty.
    """
    fig, axes = plt.subplots(1, 8, figsize=(20, 20), subplot_kw={'xticks': [], 'yticks': []})

    # zip stops at the shorter of the two, so a short list cannot raise IndexError.
    for ax, img in zip(axes.flat, images):
        ax.imshow(img / 255)

In [None]:
# Accumulators for the two splits: image arrays in x_*, integer labels in y_*.
x_train, y_train = [], []
x_test, y_test = [], []

# Training split: the label of a class is its position in `sounds`.
for sound in sounds:
    images, labels = load_images_from_path(f'spectrograms/train/{sound}', sounds.index(sound))
    x_train.extend(images)
    y_train.extend(labels)

# Validation split, loaded the same way from the `val` directories.
for sound in sounds:
    images, labels = load_images_from_path(f'spectrograms/val/{sound}', sounds.index(sound))
    x_test.extend(images)
    y_test.extend(labels)

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels. The width is fixed to the number of
# classes explicitly: without num_classes, to_categorical sizes the output from
# the largest label it sees, so a split missing the last class would come out
# narrower than the model's output layer.
y_train_encoded = to_categorical(y_train, num_classes=len(sounds))
y_test_encoded = to_categorical(y_test, num_classes=len(sounds))

In [None]:
from keras.models import Sequential
from tensorflow.keras import layers
from keras.layers import Conv2D, MaxPooling2D, Activation, Flatten, Dense, BatchNormalization, Dropout, Input
import keras.regularizers as regularizers
import keras.initializers as initializers

In [None]:
def conv_block(num_channels):
    """Return a Sequential of 3x3 'same' convolution -> batch norm -> ReLU.

    ``num_channels`` is the number of filters of the convolution.
    """
    convolution = Conv2D(num_channels, (3, 3), padding='same')
    normalization = BatchNormalization()
    relu = Activation('relu')
    return Sequential([convolution, normalization, relu])

def identity_block(num_channels):
    """Return a Sequential of a conv_block followed by a second conv/BN/ReLU stage.

    Both stages use ``num_channels`` filters. Note that the layers are simply
    stacked: no skip connection is added around them here.
    """
    first_stage = conv_block(num_channels)
    second_stage = [
        Conv2D(num_channels, (3, 3), padding='same'),
        BatchNormalization(),
        Activation('relu'),
    ]
    return Sequential([first_stage, *second_stage])

def res_block(num_channels, conv_first=False):
    """Return a Sequential of [optional strided 1x1 conv] -> conv_block -> conv -> batch norm.

    When ``conv_first`` is true the block starts with a 1x1 convolution using
    stride 2. The block ends on the batch normalisation, with no activation
    after it, and, as written, no skip connection is added.
    """
    block = Sequential()

    stages = []
    if conv_first:
        stages.append(Conv2D(num_channels, (1, 1), strides=(2, 2)))
    stages.append(conv_block(num_channels))
    stages.append(Conv2D(num_channels, (3, 3), padding='same'))
    stages.append(BatchNormalization())

    for stage in stages:
        block.add(stage)
    return block

In [None]:
# Stem: 7x7 stride-2 convolution, batch norm, ReLU, then a 3x3 stride-2 max-pool.
model = Sequential([
    Input(shape=(224, 224, 3)),
    Conv2D(64, (7, 7), strides=(2, 2), padding='same'),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling2D((3, 3), strides=(2, 2), padding='same')
])

# Three res_blocks with 64, 128 and 256 channels; only the first one starts
# with the strided 1x1 convolution.
num_channels = 64
for i in range(3):
    model.add(res_block(num_channels, conv_first=(i == 0)))
    num_channels *= 2

# num_channels has been doubled three times and is 512 from here on, so the
# stages below use 512, 1024 and 2048 channels respectively.
for i in range(4):
    model.add(identity_block(num_channels))
for i in range(6):
    model.add(identity_block(num_channels*2))
for i in range(3):
    model.add(identity_block(num_channels*4))

# Classifier head: one softmax unit per entry of `sounds`, so the output width
# follows the class list and does not repeat its length as a literal.
model.add(layers.GlobalAveragePooling2D())
model.add(Dense(len(sounds), activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Convert the lists of image arrays into single numpy arrays for Keras.
# Note: this rebinds x_train / x_test from lists to arrays.
x_train = np.array(x_train)
x_test = np.array(x_test)

# Train for 15 epochs in mini-batches of 10, evaluating on the validation
# split after every epoch; `hist` keeps the per-epoch metrics for plotting.
hist = model.fit(x_train, y_train_encoded, validation_data=(x_test, y_test_encoded), batch_size=10, epochs=15)

In [None]:
# Per-epoch accuracy on the training and validation data recorded by model.fit.
acc = hist.history['accuracy']
val_acc = hist.history['val_accuracy']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

plt.plot(epochs, acc, '-', label='Training Accuracy')
plt.plot(epochs, val_acc, ':', label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
# Render the figure; the previous argument-less plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()

In [None]:
# Run the trained model on the validation images; `prediction` is reduced to
# class indices with argmax(axis=1) in the evaluation cells below.
prediction = model.predict(np.array(x_test))

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import seaborn as sns
sns.set_theme()

# Axis labels are the class names, in label order (label i <-> sounds[i]).
class_labels = sounds

# Pass the full label range explicitly so the matrix always has one row and
# one column per class, even if a class never occurs in the validation data or
# in the predictions; otherwise the tick labels would not line up.
mat = confusion_matrix(y_test_encoded.argmax(axis=1), prediction.argmax(axis=1),
                       labels=list(range(len(class_labels))))

sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False, cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

plt.xlabel('Predicted label')
plt.ylabel('Actual label')

In [None]:
# Precision, recall, F-score and support computed from the true vs. predicted
# class indices; the result is shown as the cell output.
precision_recall_fscore_support(y_test_encoded.argmax(axis=1), prediction.argmax(axis=1))