USING CNN FOR SOUND CLASSIFICATION



In [None]:
import numpy as np
import librosa 


In [None]:
# Load one example clip to inspect. No `sr` argument is passed, so librosa's
# default target sample rate applies; `res_type` selects the resampling method.
audio , sample_rate = librosa.load('../input/urbansound8k/fold10/100648-1-2-0.wav',res_type='kaiser_fast')


In [None]:
# Shape of the decoded waveform array returned by librosa.load.
audio.shape

In [None]:
# Sample rate that librosa.load reported for the example clip.
sample_rate

In [None]:
# Compute 40 MFCC coefficients per frame for the example clip.
mfcc = librosa.feature.mfcc(y=audio,sr=sample_rate,n_mfcc=40)

In [None]:
# Display the MFCC matrix computed for the example clip.
mfcc

Feature Extraction refinement
In the previous feature extraction stage, the MFCC matrices vary in size across audio files (the number of frames depends on each sample's duration).

However, CNNs require a fixed size for all inputs. To overcome this we will zero pad the output vectors to make them all the same size.

In [None]:
# Fixed number of MFCC frames (time axis) that every feature matrix is forced to.
max_pad_len = 174


def extract_features(filename, n_mfcc=40, max_len=None):
    """Load an audio file and return a fixed-width MFCC matrix.

    The clip is decoded with librosa, converted to ``n_mfcc`` MFCC rows, and
    then forced to exactly ``max_len`` columns: shorter clips are zero-padded
    on the right, longer clips are truncated. (Previously a clip with more
    than ``max_pad_len`` frames produced a negative pad width, which made
    ``np.pad`` raise and the whole file was silently dropped.)

    Parameters
    ----------
    filename : str
        Path of the audio file to load.
    n_mfcc : int, optional
        Number of MFCC coefficients (rows) to compute. Defaults to 40.
    max_len : int or None, optional
        Target number of frames (columns). ``None`` means use the
        module-level ``max_pad_len``.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(n_mfcc, max_len)``, or ``None`` if the file could
        not be loaded or processed (the error is printed, not raised).
    """
    if max_len is None:
        max_len = max_pad_len
    try:
        audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
        n_frames = mfccs.shape[1]
        if n_frames >= max_len:
            # Too long (or exactly right): keep only the first max_len frames.
            mfccs = mfccs[:, :max_len]
        else:
            # Too short: zero-pad on the right of the time axis only.
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, max_len - n_frames)), mode='constant')
    except Exception as e:
        # Best-effort by design: report which file failed and why, then let
        # the caller decide what to do with the missing sample.
        print('Error with : ', filename, '-', repr(e))
        return None
    return mfccs

In [None]:
from tqdm import tqdm
import pandas as pd

In [None]:
# Root directory of the dataset; 'fold<N>' sub-directories are joined onto it
# later when building per-clip file paths.
audio_dataset_path = '../input/urbansound8k'
# Per-clip metadata table; the 'fold', 'slice_file_name' and 'class' columns are read below.
metadata = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')

In [None]:
import os  # was missing: os.path is used below to build the clip paths

import numpy as np
from tqdm import tqdm

# Build one [mfcc_matrix, class_label] record per clip listed in the metadata.
extracted_features = []
dataset_root = os.path.abspath(audio_dataset_path)  # loop-invariant, resolve once
# iterrows() has no length, so pass the total explicitly for a real progress bar.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    file_name = os.path.join(dataset_root, 'fold' + str(row["fold"]), str(row['slice_file_name']))
    final_class_labels = row['class']
    data = extract_features(file_name)
    if data is None:
        # extract_features already printed the failure. Skipping keeps None out
        # of the feature list so the later np.array(...) / reshape stay valid.
        continue
    extracted_features.append([data, final_class_labels])

In [None]:
# Turn the [feature, label] records into a two-column frame and preview the first rows.
extracted_features_df = pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

In [None]:
# Stack the per-clip feature matrices into one array (this relies on every
# matrix having the same shape) and collect the class labels alongside.
x = np.array(extracted_features_df['feature'].tolist())
y = np.array(extracted_features_df['class'].tolist())

In [None]:
# Shape of the stacked feature array.
x.shape

In [None]:
# Shape of the label array; its length should match the first axis of x.
y.shape

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
# Learn the class-name -> integer-id mapping, then one-hot encode the ids.
# `le` is kept so predictions can be mapped back to class names later.
le = LabelEncoder()
le.fit(y)
yy = to_categorical(le.transform(y))

In [None]:
# Split the dataset: hold out 20% of the samples for testing.
# random_state is fixed so the split is reproducible across runs.
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, yy, test_size=0.2, random_state = 42)

In [None]:
# Shape of the training features before the channel axis is added.
x_train.shape

In [None]:

# Input geometry for the CNN: 40 MFCC rows, 174 frames, a single channel.
num_rows, num_columns, num_channels = 40, 174, 1

# Give every sample an explicit trailing channel axis, as Conv2D expects 4-D input.
x_train = np.reshape(x_train, (len(x_train), num_rows, num_columns, num_channels))
x_test = np.reshape(x_test, (len(x_test), num_rows, num_columns, num_channels))

In [None]:
# Shape of the training features after the reshape to (samples, rows, columns, channels).
x_train.shape

In [None]:
# Display the one-hot encoded training labels.
y_train

# CNN MODEL

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D,Dropout , GlobalAveragePooling2D,Dense
from sklearn import metrics

In [None]:
# Number of output classes = width of the one-hot label matrix.
num_labels = yy.shape[1]
# NOTE(review): filter_size is not referenced in the visible cells; the Conv2D
# layers hard-code kernel_size=2 instead. Confirm whether it is still needed.
filter_size = 2

In [None]:

# Declare the whole network at once: four Conv2D -> MaxPooling2D -> Dropout
# stages with a doubling filter count (16, 32, 64, 128), then global average
# pooling and a softmax layer with one unit per class.
model = Sequential([
    Conv2D(filters=16, kernel_size=2, activation='relu',
           input_shape=(num_rows, num_columns, num_channels)),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),

    Conv2D(filters=32, kernel_size=2, activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),

    Conv2D(filters=64, kernel_size=2, activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),

    Conv2D(filters=128, kernel_size=2, activation='relu'),
    MaxPooling2D(pool_size=2),
    Dropout(0.2),
    GlobalAveragePooling2D(),

    Dense(num_labels, activation='softmax'),
])

In [None]:
# Compile the model: categorical cross-entropy matches the one-hot labels and
# softmax output; 'accuracy' is the only extra metric, so evaluate() returns
# [loss, accuracy].
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
# Print the layer-by-layer architecture summary.
model.summary()

# Baseline before any training: score the freshly initialised model on the
# held-out split. evaluate() returns [loss, accuracy]; convert to a percentage.
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % (accuracy,))

In [None]:
import os
from datetime import datetime

from tensorflow.keras.callbacks import ModelCheckpoint

# Training budget. A quicker alternative that was tried earlier: 12 epochs
# with a batch size of 128.
num_epochs = 72
num_batch_size = 256

# Make sure the checkpoint directory exists before training starts, so the
# first "best so far" save cannot fail on a missing folder.
checkpoint_path = 'saved_models/weights.best.basic_cnn.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True keeps only the weights with the best monitored
# validation score seen so far.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): the test split is also used as validation data here, so the
# checkpoint is selected on the same data that is later reported as test
# accuracy. Consider carving a separate validation split out of x_train.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
# Evaluate the model on the training and testing sets.
# score is [loss, accuracy] because the model was compiled with metrics=['accuracy'].
# NOTE(review): no checkpoint is reloaded in the visible cells, so this scores
# the model as fit() left it, not the saved "best" weights. Confirm this is intended.
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

In [None]:

def print_prediction(file_name):
    """Classify one audio file and print the predicted class and all class probabilities.

    The file is converted to an MFCC matrix with ``extract_features``,
    reshaped to a single-sample batch, and passed through ``model`` once.
    The arg-max class is printed first, followed by one line per class with
    its probability.

    Uses ``model.predict`` plus ``np.argmax`` rather than the
    ``predict_classes`` / ``predict_proba`` helpers, which are no longer
    available on current tf.keras ``Sequential`` models.

    Parameters
    ----------
    file_name : str
        Path of the audio file to classify.

    Raises
    ------
    ValueError
        If ``extract_features`` could not produce features for the file.
    """
    prediction_feature = extract_features(file_name)
    if prediction_feature is None:
        # Fail with a clear message instead of an AttributeError on None.reshape.
        raise ValueError('Could not extract features from: ' + str(file_name))
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # One forward pass serves both the class decision and the probability table.
    predicted_proba = model.predict(prediction_feature, verbose=0)[0]
    predicted_index = int(np.argmax(predicted_proba))
    predicted_class = le.inverse_transform(np.array([predicted_index]))
    print("The predicted class is:", predicted_class[0], '\n')

    # Map every class index back to its name in a single call.
    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, probability in zip(categories, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f'))

In [None]:
# Classify a clip that lives outside the UrbanSound8K folders (separate 'gunshot' input directory).
print_prediction('../input/gunshot/GunShotSnglShotIn PE1097906.wav')

In [None]:
import IPython.display as ipd
# Render an inline audio player for the same file that was passed to print_prediction.
ipd.Audio('../input/gunshot/GunShotSnglShotIn PE1097906.wav')