In [57]:
import os
import glob
import librosa
from tqdm import tqdm
import joblib
import pandas as pd
import numpy as np
import keras

In [45]:
mfcc_dict = {}
sound_path = os.path.join("..", "data", "sounds")
max_sound_length = 173

for file in tqdm(glob.glob(os.path.join(sound_path, "*.wav"))):
    sound_id = file[len(sound_path + "/"):-len(".wav")]
    mfcc = librosa.feature.mfcc(*librosa.load(file))
    mfcc = librosa.util.fix_length(mfcc, max_sound_length)
    mfcc_dict[int(sound_id)] = mfcc

100%|██████████| 3670/3670 [10:11<00:00,  6.01it/s]


In [47]:
mfcc_dict[666]

array([[ -2.42579995e+02,  -2.49812213e+02,  -2.67513318e+02, ...,
         -2.67675760e+02,  -2.57069240e+02,  -2.28439155e+02],
       [  1.32293902e+02,   1.54785994e+02,   1.76246104e+02, ...,
          1.78335009e+02,   1.65681229e+02,   1.29929888e+02],
       [  7.60230122e+01,   6.65809508e+01,   4.24261387e+01, ...,
          4.10531854e+01,   5.22641242e+01,   5.11131522e+01],
       ..., 
       [ -2.85158617e+00,  -5.29948492e+00,  -1.02560000e+01, ...,
         -3.13509516e+00,  -7.69428484e+00,  -9.87785389e+00],
       [ -1.64310428e+00,  -4.57154473e+00,  -9.01730536e+00, ...,
         -1.35508123e+00,  -6.15971884e+00,  -8.64188705e+00],
       [ -3.89875712e+00,  -5.58660890e+00,  -1.38850054e-01, ...,
         -1.26545881e+00,  -3.58002005e+00,  -4.30527391e+00]])

In [48]:
mfcc_filepath = os.path.join("..", "data", "mfcc", "mfcc_dict.z")
joblib.dump(mfcc_dict, mfcc_filepath)

['../data/mfcc/mfcc_dict.z']

In [2]:
mfcc_dict = joblib.load(mfcc_filepath)

In [99]:
label_dir = os.path.join("..", "data", "labels")
df_labels = pd.read_csv(os.path.join(label_dir, "train_short.csv"))
df_labels[:5]

Unnamed: 0,ID,Class
0,0,siren
1,1,street_music
2,2,drilling
3,3,siren
4,4,dog_bark


In [101]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

train_mfcc_2D = [mfcc_dict[mfcc_id] for mfcc_id in df_labels.loc[:, "ID"]]
train_mfcc_2D = np.stack(train_mfcc_2D)
train_mfcc_1D = train_mfcc_2D.reshape(136, -1)
print(train_mfcc_2D.shape)
print(train_mfcc_1D.shape)
labels = df_labels.loc[:, "Class"]

label_enc = LabelEncoder()
int_labels = label_enc.fit_transform(labels)

one_hot_enc = OneHotEncoder()
y = one_hot_enc.fit_transform(int_labels.reshape(-1, 1))
y

(136, 20, 173)
(136, 3460)


<136x10 sparse matrix of type '<class 'numpy.float64'>'
	with 136 stored elements in Compressed Sparse Row format>

In [130]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D

num_classes = 10

mlp = keras.models.Sequential()
mlp.add(Dense(64, input_shape=(train_mfcc_1D.shape[1],), activation="relu"))
mlp.add(BatchNormalization())
mlp.add(Dense(64, activation="relu"))
mlp.add(BatchNormalization())
mlp.add(Dense(num_classes, activation="softmax"))

In [124]:
# test data TODO: use a function <.<

df_labels = pd.read_csv(os.path.join(label_dir, "test.csv"))
test_mfcc_2D = [mfcc_dict[mfcc_id] for mfcc_id in df_labels.loc[:, "ID"]]
test_mfcc_2D = np.stack(test_mfcc_2D)
test_mfcc_1D = test_mfcc_2D.reshape(test_mfcc_2D.shape[0], -1)
test_labels = df_labels.loc[:, "Class"]

test_label_enc = LabelEncoder()
test_int_labels = test_label_enc.fit_transform(test_labels)

test_one_hot_enc = OneHotEncoder()
test_y = test_one_hot_enc.fit_transform(test_int_labels.reshape(-1, 1))
test_y

<33x10 sparse matrix of type '<class 'numpy.float64'>'
	with 33 stored elements in Compressed Sparse Row format>

In [131]:
mlp.compile(optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

print(mlp.metrics_names)
for epoch in range(10):
    mlp.fit(train_mfcc_1D, y, epochs=10, batch_size=32, verbose=0)
    test_acc = mlp.evaluate(test_mfcc_1D, test_y, verbose=0)[1]
    train_acc = mlp.evaluate(train_mfcc_1D, y, verbose=0)[1]
    print(f"epoch {epoch + 1}: test_acc={test_acc:.3}, "
          f"train_acc={train_acc:.2}")

['loss', 'acc']
epoch 1: test_acc=0.515, train_acc=0.85
epoch 2: test_acc=0.273, train_acc=0.86
epoch 3: test_acc=0.545, train_acc=0.98
epoch 4: test_acc=0.424, train_acc=0.94
epoch 5: test_acc=0.485, train_acc=1.0
epoch 6: test_acc=0.333, train_acc=0.95
epoch 7: test_acc=0.333, train_acc=0.96
epoch 8: test_acc=0.394, train_acc=0.94
epoch 9: test_acc=0.424, train_acc=0.81
epoch 10: test_acc=0.424, train_acc=0.96
