In [1]:
# Imports

import os
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
from tqdm import tqdm
from scipy.io import wavfile as wav

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Display names for the ten classes, keyed by integer class index.
label_dict = dict(enumerate((
    'air conditioner',
    'car horn',
    'children playing',
    'dog bark',
    'drilling',
    'engine idling',
    'gun shot',
    'jackhammer',
    'siren',
    'street music',
)))

# Root folder of the audio folds, and the metadata table describing each clip.
audio_dataset_path = 'UrbanSound8K/audio/'
metadata = pd.read_csv(filepath_or_buffer='UrbanSound8K/metadata/UrbanSound8K.csv')

In [4]:
def features_extractor(file, n_mfcc=40):
    """Return a fixed-length MFCC feature vector for one audio file.

    Parameters
    ----------
    file : str or path-like
        Audio file to load; passed straight to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute. The default matches the
        40-element input expected by the model defined later in the notebook.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per MFCC coefficient, each averaged over
        all frames of the clip.
    """
    # Read from the `file` argument. Reading a global `file_name` here would
    # silently ignore the argument and only work while a loop variable of
    # that name happens to exist in the kernel.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

    # Collapse the time axis so clips of any length yield the same-size vector.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features

In [5]:
# Collect one [feature_vector, class_label] pair per row of the metadata table.
extracted_features = []

# The absolute dataset root does not change between rows, so resolve it once.
audio_root = os.path.abspath(audio_dataset_path)

# iterrows() is a generator with no length, so pass `total` explicitly;
# otherwise tqdm can only show a running count with no bar or ETA.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips live under <root>/fold<N>/<slice_file_name>; os.path.join supplies
    # the separators.
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])
    

8732it [06:56, 20.95it/s]


In [6]:
# Tabulate the collected [feature vector, label] pairs and preview the first rows.
extracted_features_df = pd.DataFrame(
    data=extracted_features,
    columns=['feature', 'class'],
)
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-215.79301, 71.66612, -131.81377, -52.091328,...",dog_bark
1,"[-424.68677, 110.56227, -54.148235, 62.01073, ...",children_playing
2,"[-459.56467, 122.800354, -47.92471, 53.265694,...",children_playing
3,"[-414.55377, 102.896904, -36.66495, 54.18041, ...",children_playing
4,"[-447.397, 115.0954, -53.809113, 61.608585, 1....",children_playing


In [7]:
# Pull the feature vectors and the class names out of the frame as arrays.
X = np.array(extracted_features_df['feature'].to_list())
y = np.array(extracted_features_df['class'].to_list())

In [8]:
# Map the string class names to integer ids, then one-hot encode the ids.
# The labels are read from the DataFrame column instead of overwriting `y`
# in terms of itself, so the cell is idempotent: re-running it no longer
# feeds an already one-hot matrix back into the encoder.
labelencoder = LabelEncoder()
y = to_categorical(labelencoder.fit_transform(extracted_features_df['class']))

In [9]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Number of classes = width of the one-hot label matrix.
num_labels = y.shape[-1]

In [13]:
# Model

# Fully connected classifier: three Dense -> ReLU -> Dropout stages followed
# by a softmax layer with one unit per class.
model = Sequential([
    # first layer (consumes the 40-element feature vector)
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    # second layer
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    # third layer
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    # final layer
    Dense(num_labels),
    Activation('softmax'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 100)               4100      
                                                                 
 activation_3 (Activation)   (None, 100)               0         
                                                                 
 dropout_3 (Dropout)         (None, 100)               0         
                                                                 
 dense_4 (Dense)             (None, 200)               20200     
                                                                 
 activation_4 (Activation)   (None, 200)               0         
                                                                 
 dropout_4 (Dropout)         (None, 200)               0         
                                                                 
 dense_5 (Dense)             (None, 100)              

In [14]:
# One-hot targets -> categorical cross-entropy; track accuracy during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Training hyper-parameters
num_epochs = 100
num_batch_size = 32

# Write the model to disk only when the validation loss improves.
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5', 
                               verbose=1, save_best_only=True)

# NOTE(review): the test split is also used as validation data here, so the
# checkpoint is selected on the same samples that are later used to report
# accuracy. Consider carving a separate validation split out of X_train.
start = datetime.now()

history = model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)

# Report wall-clock training time (this is what `start` was recorded for).
duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
Epoch 1: val_loss improved from inf to 2.29504, saving model to saved_models/audio_classification.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 2.29504 to 2.28200, saving model to saved_models/audio_classification.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 2.28200 to 2.27510, saving model to saved_models/audio_classification.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 2.27510 to 2.27148, saving model to saved_models/audio_classification.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 2.27148 to 2.26675, saving model to saved_models/audio_classification.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 2.26675 to 2.23309, saving model to saved_models/audio_classification.hdf5
Epoch 7/100
Epoch 7: val_loss improved from 2.23309 to 2.21801, saving model to saved_models/audio_classification.hdf5
Epoch 8/100
Epoch 8: val_loss improved from 2.21801 to 2.15866, saving model to saved_models/audio_classification.hdf5
Epoch 9/100
Epoch 9: val_loss improved from 2.15866 

Epoch 26/100
Epoch 26: val_loss improved from 1.57207 to 1.55814, saving model to saved_models/audio_classification.hdf5
Epoch 27/100
Epoch 27: val_loss improved from 1.55814 to 1.49919, saving model to saved_models/audio_classification.hdf5
Epoch 28/100
Epoch 28: val_loss improved from 1.49919 to 1.45715, saving model to saved_models/audio_classification.hdf5
Epoch 29/100
Epoch 29: val_loss improved from 1.45715 to 1.45287, saving model to saved_models/audio_classification.hdf5
Epoch 30/100
Epoch 30: val_loss improved from 1.45287 to 1.42901, saving model to saved_models/audio_classification.hdf5
Epoch 31/100
Epoch 31: val_loss improved from 1.42901 to 1.40111, saving model to saved_models/audio_classification.hdf5
Epoch 32/100
Epoch 32: val_loss improved from 1.40111 to 1.34480, saving model to saved_models/audio_classification.hdf5
Epoch 33/100
Epoch 33: val_loss improved from 1.34480 to 1.30036, saving model to saved_models/audio_classification.hdf5
Epoch 34/100
Epoch 34: val_loss 

Epoch 51/100
Epoch 51: val_loss did not improve from 0.98241
Epoch 52/100
Epoch 52: val_loss did not improve from 0.98241
Epoch 53/100
Epoch 53: val_loss improved from 0.98241 to 0.95789, saving model to saved_models/audio_classification.hdf5
Epoch 54/100
Epoch 54: val_loss improved from 0.95789 to 0.95505, saving model to saved_models/audio_classification.hdf5
Epoch 55/100
Epoch 55: val_loss did not improve from 0.95505
Epoch 56/100
Epoch 56: val_loss improved from 0.95505 to 0.94979, saving model to saved_models/audio_classification.hdf5
Epoch 57/100
Epoch 57: val_loss improved from 0.94979 to 0.91388, saving model to saved_models/audio_classification.hdf5
Epoch 58/100
Epoch 58: val_loss did not improve from 0.91388
Epoch 59/100
Epoch 59: val_loss improved from 0.91388 to 0.90354, saving model to saved_models/audio_classification.hdf5
Epoch 60/100
Epoch 60: val_loss did not improve from 0.90354
Epoch 61/100
Epoch 61: val_loss did not improve from 0.90354
Epoch 62/100
Epoch 62: val_lo

Epoch 78/100
Epoch 78: val_loss improved from 0.83553 to 0.82161, saving model to saved_models/audio_classification.hdf5
Epoch 79/100
Epoch 79: val_loss did not improve from 0.82161
Epoch 80/100
Epoch 80: val_loss improved from 0.82161 to 0.81342, saving model to saved_models/audio_classification.hdf5
Epoch 81/100
Epoch 81: val_loss did not improve from 0.81342
Epoch 82/100
Epoch 82: val_loss did not improve from 0.81342
Epoch 83/100
Epoch 83: val_loss did not improve from 0.81342
Epoch 84/100
Epoch 84: val_loss improved from 0.81342 to 0.79474, saving model to saved_models/audio_classification.hdf5
Epoch 85/100
Epoch 85: val_loss did not improve from 0.79474
Epoch 86/100
Epoch 86: val_loss improved from 0.79474 to 0.78284, saving model to saved_models/audio_classification.hdf5
Epoch 87/100
Epoch 87: val_loss did not improve from 0.78284
Epoch 88/100
Epoch 88: val_loss did not improve from 0.78284
Epoch 89/100
Epoch 89: val_loss improved from 0.78284 to 0.77805, saving model to saved_m

<keras.callbacks.History at 0x7fabe539d220>

In [16]:
# evaluate() returns [loss, accuracy] for the metrics the model was compiled
# with; print only the accuracy.
test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_accuracy[1])

0.7727532982826233


In [17]:
# Classify one clip: repeat the MFCC-mean feature pipeline, then run the model.
filename = 'UrbanSound8K/audio/fold3/6988-5-0-1.wav'
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = mfccs_features.T.mean(axis=0)

print(mfccs_scaled_features)
# The model expects a batch, so add a leading axis of length 1.
mfccs_scaled_features = mfccs_scaled_features[np.newaxis, :]
print(mfccs_scaled_features)
print(mfccs_scaled_features.shape)

predicted_label = model.predict(mfccs_scaled_features)
print(predicted_label)

# Index of the highest-probability class, mapped to its display name.
prediction_class = predicted_label.argmax(axis=1)
print(label_dict[prediction_class[0]])

[-124.41564     162.64157     -33.432667     27.59999       6.6116724
   21.610598      2.9962502    19.552324      1.294804     17.197958
   -1.8212044    13.08889       3.2787871     3.667115      0.96994126
    7.140642     -0.7302965     4.0208635     3.9708393     3.6621003
    4.8365865     4.4332657     4.394522      2.5353334     8.956006
    3.222451      6.7038193     4.445663      6.2371874     4.125844
    3.1068814     1.4973228     1.8097677     0.9489547     2.7819495
    2.6311927     4.027545      1.5059777     3.3667514     3.2168236 ]
[[-124.41564     162.64157     -33.432667     27.59999       6.6116724
    21.610598      2.9962502    19.552324      1.294804     17.197958
    -1.8212044    13.08889       3.2787871     3.667115      0.96994126
     7.140642     -0.7302965     4.0208635     3.9708393     3.6621003
     4.8365865     4.4332657     4.394522      2.5353334     8.956006
     3.222451      6.7038193     4.445663      6.2371874     4.125844
     3.1068814  