In [None]:
import numpy as np
import pandas as pd
import os

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt

from datetime import datetime
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 

 # **Introduction to the Dataset**

The UrbanSound8k dataset is a collection of 8732 sound files classified in 10 different classes:

> 1. air_conditioner;
> 2. car_horn; 
> 3. children_playing;
> 4. dog_bark;
> 5. drilling;
> 6. engine_idling;
> 7. gun_shot;
> 8. jackhammer;
> 9. siren;
> 10. street_music. 

These sounds are representative of real urban sounds. The dataset also contains a .csv file with metadata about each audio file.

# Objective

The goal of this notebook is to use the UrbanSound8k dataset to create a classification model that can correctly identify the sound class of real-life audio files. Then, this classification model should be able to classify different segments within a single audio file.

# Methodology

The methodology used in this notebook is the following:

1. Understand the data
2. Extract features using librosa library
3. Separate our dataset in train, test and validation
4. Create and run a Keras Model
5. Collect accuracy results data for multiple combination of features
6. Analyse the results and select the best combination of features
7. Run the model with real audio data

This model does not use image analysis of spectrogram images. All the features are computed from data extracted directly from the audio.

# Exploring the Dataset

In [None]:
def plot_sound(filename):
    '''Plot the waveform of an audio file on a wide (30x4) matplotlib figure.

    Parameters
    ----------
    filename : str
        Path of the audio file; it is loaded with librosa.load using its defaults.
    '''
    plt.figure(figsize=(30,4))
    data,sample_rate = librosa.load(filename)
    # librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow;
    # prefer waveshow and only fall back to waveplot on older installations.
    draw_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
    draw_wave(data, sr=sample_rate)

In [None]:
# Siren example: draw the waveform, then embed an audio player.
# ipd.Audio(...) has to stay the last expression of the cell so the player is displayed.
filename = '../input/urbansound8k/fold4/24347-8-0-12.wav' 
plot_sound(filename)
ipd.Audio(filename)

In [None]:
# Drilling example: draw the waveform, then embed an audio player.
# ipd.Audio(...) has to stay the last expression of the cell so the player is displayed.
filename = '../input/urbansound8k/fold7/104625-4-0-52.wav'
plot_sound(filename)
ipd.Audio(filename)

In [None]:
# Reading the metadata file. Later cells read its 'fold', 'slice_file_name' and 'class'
# columns to locate and label each audio file.
metadata = pd.read_csv('../input/urbansound8k/UrbanSound8K.csv')
metadata.head()  # preview of the first rows (cell output)

In [None]:
# Class distribution: number of audio files per class label, most frequent first
dist = metadata['class'].value_counts()
print(dist)

As you can see, the data isn't evenly distributed between classes.
Since there's significantly less data for gun_shot and car_horn, the accuracy for these classes may differ from the others.
This may not be a problem if these samples are distinct enough from the other ones.

# Extracting Features

There are many ways to extract features from a sound file. The librosa library offers many options for this kind of feature extraction.


First, we will test extracting MFCCs (Mel-Frequency Cepstral Coefficients) from the sound files. Then we will test extracting Spectral Contrast information from the sound files. Lastly, we will test extracting Chroma STFT information.

All the functions used for these tests are listed below.

In [None]:
### MFCC based feature extraction:

def extract_features_mfcc_mean(file_name):
    '''Summarise an audio file by the time-average of its MFCCs.

    The file is loaded with librosa (res_type='kaiser_fast'), 60 MFCCs are computed
    and each coefficient is averaged over all frames. Returns a 1D array of size 60.
    '''
    signal, rate = librosa.load(file_name, res_type='kaiser_fast')
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=60)
    # transpose so that axis 0 runs over frames, then average each coefficient
    return np.mean(coefficients.T, axis=0)

def extract_features_mfcc_mean_var_std(file_name):
    '''Summarise an audio file by five per-coefficient statistics of its MFCCs.

    60 MFCCs are computed and reduced over all frames with, in this order:
    mean, variance, max, min and standard deviation.
    Returns a 1D array of size 300 (5 statistics x 60 coefficients).
    '''
    signal, rate = librosa.load(file_name, res_type='kaiser_fast')
    per_frame = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=60).T
    # the order of the reducers defines the layout of the returned vector
    reducers = (np.mean, np.var, np.amax, np.amin, np.std)
    return np.hstack([reducer(per_frame, axis=0) for reducer in reducers])

def extract_features_mfcc_mean_std(file_name):
    '''Summarise an audio file by the mean and standard deviation of its MFCCs.

    60 MFCCs are computed and each coefficient is reduced over all frames.
    Returns a 1D array of size 120: the 60 means followed by the 60 stds.
    '''
    signal, rate = librosa.load(file_name, res_type='kaiser_fast')
    per_frame = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=60).T
    return np.hstack([np.mean(per_frame, axis=0), np.std(per_frame, axis=0)])


### Spectral Contrast based feature extraction:
def extract_features_spectral_contrast(file_name):
    '''Summarise an audio file by the mean and std of its spectral contrast.

    Spectral contrast is computed with n_bands=3 and every contrast row is reduced
    over all frames. Returns a 1D array of size 8: the means followed by the stds.
    '''
    signal, rate = librosa.load(file_name, res_type='kaiser_fast')
    per_frame = librosa.feature.spectral_contrast(y=signal, sr=rate, n_bands=3).T
    return np.hstack([np.mean(per_frame, axis=0), np.std(per_frame, axis=0)])


### Chroma Stft based feature extraction:
def extract_features_chroma_stft(file_name):
    '''Summarise an audio file by the mean and std of its chromagram.

    The chromagram is computed from the magnitude of the short-time Fourier
    transform and every chroma row is reduced over all frames.
    Returns a 1D array of size 24: the means followed by the stds.
    '''
    signal, rate = librosa.load(file_name, res_type='kaiser_fast')
    magnitude = np.abs(librosa.stft(signal))
    per_frame = librosa.feature.chroma_stft(S=magnitude, sr=rate).T
    return np.hstack([np.mean(per_frame, axis=0), np.std(per_frame, axis=0)])

We'll be using MFCCs Mean and STD. You can find the accuracy analysis later in this notebook.

In [None]:
# Feature extractor applied to every file by the extraction loop below.
# Swap in any other extract_features_* function defined above to compare feature sets.
feature_extraction_function = extract_features_mfcc_mean_std

In [None]:
fulldatasetpath = '../input/urbansound8k/'
features = []

# Resolve the dataset root once instead of on every iteration.
dataset_root = os.path.abspath(fulldatasetpath)

# extracting features for each audio file:
# iterrows() is a generator without a length, so tqdm needs an explicit total
# to display a real progress bar and an ETA.
for index, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # files are stored as <root>/fold<fold>/<slice_file_name>
    file_name = os.path.join(dataset_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]
    data = feature_extraction_function(file_name)
    features.append([data, class_label])

# converting to dataframe: one row per audio file (feature vector, class label)
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])
print('Finished feature extraction from ', len(featuresdf), ' files')

In [None]:
# Dividing features and labels in arrays
# X: one row per audio file, built from the per-file feature vectors
X = np.array(featuresdf.feature.tolist())
# y: the class label of each file, in the same row order as X
y = np.array(featuresdf.class_label.tolist())

print('Features Shape:', X.shape)
print('Class Shape:', y.shape)

Since we want the output of our prediction model to be a probability for each class, we will use One Hot Encoder. The final result for the prediction will be the one with the highest probability.

In [None]:
# `le` maps class names to integer ids; it is reused later to turn predictions
# back into class names, so it must stay fitted on the training labels.
le = LabelEncoder()
# integer ids -> one-hot rows (one column per class)
yy = to_categorical(le.fit_transform(y))
print('Label Encoder')
print(y)
print(yy)

We'll also split our dataset into three parts: train, validation and test.
With this separation, we can evaluate our model on audio files that were not used to train it.

In [None]:
# First hold out 20% of the data as the test set...
x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)
# ...then take 25% of the remaining 80% as validation, i.e. a 60/20/20 split overall.
# NOTE(review): the split is random and not stratified, and it ignores the 'fold' column
# of the metadata; slices cut from the same recording may end up in different splits,
# which could inflate the reported accuracy. Confirm this is acceptable.
x_train, x_val, y_train, y_val  = train_test_split(x_train, y_train, test_size=0.25, random_state = 42)

print('Train Shapes')
print(x_train.shape)
print(y_train.shape)
print()

print('Test Shapes')
print(x_test.shape)
print(y_test.shape)
print()

print('Validation Shapes')
print(x_val.shape)
print(y_val.shape)

# Creating the Classification Model

We'll be using keras for model training.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from sklearn import metrics
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Layer sizes are derived from the data so the cell works for any feature extractor
n_features = X.shape[1] # width of the input layer: number of features per sample
num_labels = yy.shape[1] # width of the output layer: number of one-hot columns

# Creating our model: three identical hidden stages (Dense 256 -> ReLU -> Dropout 0.2)
# followed by a softmax classifier
model = Sequential([
    Dense(256, input_shape=(n_features,)),
    Activation('relu'), # rectified linear unit activation
    Dropout(0.2), # dropout to minimize overfitting

    Dense(256),
    Activation('relu'),
    Dropout(0.2),

    Dense(256),
    Activation('relu'),
    Dropout(0.2),

    Dense(num_labels), # one output unit per class
    Activation('softmax'), # labels are one-hot encoded, so output class probabilities
])


# Model Compilation
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

We'll use 3 callbacks for our model fit:

> 1. ModelCheckpoint - Save the best model (lowest 'val_loss') to a .h5 file.
> 2. ReduceLROnPlateau - Reduce the learning rate when learning stagnates.
> 3. EarlyStopping - Stop training when 'val_loss' has not improved for 10 epochs.


In [None]:
# training parameters
num_epochs = 100  # upper bound; EarlyStopping below may end training sooner
num_batch_size = 32

# callback definition
checkpoint = 'audio_classification_best_model.h5'
# Write the model to `checkpoint` only when val_loss improves (save_best_only=True).
mc = ModelCheckpoint(checkpoint, monitor = 'val_loss', mode = 'min', verbose = 1, save_best_only = True)
# Multiply the learning rate by 0.1 after 10 epochs without val_loss improvement.
# NOTE(review): this patience equals EarlyStopping's patience on the same metric, so the
# reduced learning rate will rarely get a chance to be used before training stops.
# Confirm whether a smaller patience was intended here.
rp = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 10, verbose = 1, mode ='min', min_lr = 0.00000001)
# Stop training after 10 epochs without val_loss improvement.
es = EarlyStopping(monitor='val_loss', mode='min', verbose = 1, patience=10)

# Model Training: fit on the train split, monitor the validation split
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_val, y_val), verbose=1, callbacks=[es, rp, mc])


In [None]:
# NOTE(review): `model` holds the weights of the last trained epoch; the best weights were
# only written to the checkpoint file, and EarlyStopping above does not request
# restore_best_weights. Confirm whether the checkpoint should be loaded before evaluating.

# testing model with testing data (score is [loss, accuracy])
score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

# testing model with validation data
score = model.evaluate(x_val, y_val, verbose=0)
print("Validation Accuracy: ", score[1])

Using the extract_features_mfcc_mean_std function, the testing accuracy is usually around 0.90–0.93.

In [None]:
# testing model on the validation split (score is [loss, accuracy])
# This repeats the validation evaluation already done in the previous cell.
score = model.evaluate(x_val, y_val, verbose=0)
print("Validation Accuracy: ", score[1])

Using the extract_features_mfcc_mean_std function, the validation accuracy is also usually around 0.90–0.93.

# Accuracy Analysis with Other Features and Model Configuration (Training History):

First I tested different n_mfccs:

        MFCCs numbers using only mean for each segment

        --- n_mfccs_mean = 100, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.6542644500732422;
        Tentativa 2 Accuracy: 0.7012020349502563;
        Tentativa 3 Accuracy: 0.6891814470291138

        --- n_mfccs_mean = 80, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.6742988228797913;
        Tentativa 2 Accuracy: 0.6645678281784058;
        Tentativa 3 Accuracy: 0.6674298644065857

        --- n_mfccs_mean = 60, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.6176302433013916;
        Tentativa 2 Accuracy: 0.6325128674507141;
        Tentativa 3 Accuracy: 0.6330853104591370

        --- n_mfccs_mean = 40, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.5523754954338074;
        Tentativa 2 Accuracy: 0.635374903678894;
        Tentativa 3 Accuracy: 0.6136233806610107

        --- n_mfccs_mean = 20, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.48654836416244507;
        Tentativa 2 Accuracy: 0.6159129738807678;
        Tentativa 3 Accuracy: 0.5317687392234802

        --- n_mfccs_mean = 10, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.47853463888168335;
        Tentativa 2 Accuracy: 0.45907270908355713;
        Tentativa 3 Accuracy: 0.45621064305305480


With these results, I had a benchmark for the testing accuracy rate.
I also noticed that the gains for n_mfccs > 60 were very small.


After these initial testings I started looking for more features for the mfccs segments:

        MFCCs numbers using only mean and std for each segment

        --- n_mfccs_mean_std = 100, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.8248425722122192
        Tentativa 2 Accuracy: 0.8236977458000183
        Tentativa 3 Accuracy: 0.805380642414093

        --- n_mfccs_mean_std = 80, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.8168288469314575
        Tentativa 2 Accuracy: 0.8093875050544739
        Tentativa 3 Accuracy: 0.8185460567474365

        --- n_mfccs_mean_std = 60, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.7767601609230042
        Tentativa 2 Accuracy: 0.782484233379364
        Tentativa 3 Accuracy: 0.7985117435455322

        --- n_mfccs_mean_std = 40, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.7784773707389832
        Tentativa 2 Accuracy: 0.7641671299934387
        Tentativa 3 Accuracy: 0.7710360884666443

        --- n_mfccs_mean_std = 20, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.7063537240028381
        Tentativa 2 Accuracy: 0.7000572681427002
        Tentativa 3 Accuracy: 0.6748712062835693

        --- n_mfccs_mean_std = 10, Epochs = 10 ---
        Tentativa 1 Accuracy: 0.5792787671089172;
        Tentativa 2 Accuracy: 0.5970234870910645
        Tentativa 3 Accuracy: 0.6130509376525879


With mean and std I noticed an increment in testing accuracy.

Also, surprisingly the gunshot classifications started to become more precise.
Since the gunshot usually is a fast and loud sound, std might be a better solution than just mean for the classification.

I started testing some more calculations over the mfccs segments, like variance, min and max,
but no other combination performed as well as mean and std.



After that I started testing callbacks for the model.
First I used early stopping:

        MFCCs numbers using only mean and std for each segment and keras early stopping

        --- n_mfccs_mean_std = 100 ---
        Tentativa 1 Accuracy: 0.8992558717727661, Epochs = 37/50
        Tentativa 2 Accuracy: 0.8878076672554016, Epochs = 44/50
        Tentativa 3 Accuracy: 0.9055523872375488, Epochs = 50/50

        --- n_mfccs_mean_std = 80 ---
        Tentativa 1 Accuracy: 0.8969662189483643, Epochs = 35/50
        Tentativa 2 Accuracy: 0.8946765661239624, Epochs = 35/50
        Tentativa 3 Accuracy: 0.8815111517906189, Epochs = 27/50

        --- n_mfccs_mean_std = 60 ---
        Tentativa 1 Accuracy: 0.9084144234657288, Epochs = 48/50
        Tentativa 2 Accuracy: 0.8872352838516235, Epochs = 39/50
        Tentativa 3 Accuracy: 0.8889524936676025, Epochs = 44/50

        --- n_mfccs_mean_std = 40 ---
        Tentativa 1 Accuracy: 0.8609043955802917, Epochs = 31/50
        Tentativa 2 Accuracy: 0.8717802166938782, Epochs = 37/50
        Tentativa 3 Accuracy: 0.8660560846328735, Epochs = 32/50

        --- n_mfccs_mean_std = 20 ---
        Tentativa 1 Accuracy: 0.8156840205192566, Epochs = 30/50
        Tentativa 2 Accuracy: 0.8156840205192566, Epochs = 37/50
        Tentativa 3 Accuracy: 0.8420149087905884, Epochs = 59/50

With early stopping I noticed a further increase in testing accuracy.
I could also see that the number of epochs used in the previous tests was limiting the testing accuracy.

I noticed that n_mfcc = 60 is the best number for mfccs segments.
Considering this, I only used this configuration for my mfccs models.




Then I started experimenting with some other features that the librosa library has:


First, let's try Spectral_Contrast:

        Spectral_Contrast using mean, std and early stopping

        Tentativa 1 Accuracy: 0.587292492389679, Epochs = 74/100
        Tentativa 2 Accuracy: 0.578133940696716, Epochs = 70/100
        Tentativa 3 Accuracy: 0.6113337278366089, Epochs = 98/100

The results were significantly lower than mfccs.


Now , let's try Chroma_Stft:

        Chroma_Stft using mean, std and early stopping

        Tentativa 1 Accuracy: 0.659416139125824, Epochs = 78/100
        Tentativa 2 Accuracy: 0.6765884160995483, Epochs = 100/100
        Tentativa 3 Accuracy: 0.6685746908187866, Epochs = 97/100

The results were also significantly lower than mfccs.



With these trainings I was quite happy with the 60 mfccs, mean, std and early stopping results.
Lastly I added the ReduceLROnPlateau callback to lower the learning rate when the validation loss stops improving.


#### Model Chosen:

        features:
            n_MFCC = 60 with mean and std for each segment
        
        model:
            ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 10, verbose = 1, mode ='min', min_lr = 0.00000001)
            ModelCheckpoint(checkpoint, monitor = 'val_loss', mode = 'min', verbose = 1, save_best_only = True)
            EarlyStopping(monitor='val_loss', mode='min', verbose = 1, patience=10)
        
        results benchmark:
            Accuracy: 0.9273039698600769


All the functions used for this training are in this notebook.

# Applying the Model

In [None]:
def print_prediction_mfcc_mean_std(filename):
    '''Classify an audio file with the trained model and print the result.

    Features are extracted with extract_features_mfcc_mean_std; the notebook-level
    `model` produces the class probabilities and the notebook-level label encoder
    `le` maps class ids back to class names.

    Prints the predicted class followed by the probability of every class.

    Parameters
    ----------
    filename : str
        Path of the audio file to classify.
    '''
    prediction_feature = np.array([extract_features_mfcc_mean_std(filename)])

    # Sequential.predict_classes / predict_proba were removed from Keras (TF 2.6).
    # predict() returns the same probabilities and argmax gives the class id.
    # verbose=0 keeps the output free of progress bars, as predict_classes did.
    predicted_proba_vector = model.predict(prediction_feature, verbose=0)
    predicted_vector = np.argmax(predicted_proba_vector, axis=-1)
    predicted_class = le.inverse_transform(predicted_vector)
    print("\n", "Class: ", predicted_class[0], '\n')

    predicted_proba = predicted_proba_vector[0]
    categories = le.inverse_transform(np.arange(len(predicted_proba)))
    for category, probability in zip(categories, predicted_proba):
        print(category, ": ", format(probability, '.2f'), sep='')
    print('\n', '=-' *20, sep='')

In [None]:
# Testing File 1: the drilling clip that was plotted in the exploration section
print('Expected drilling')
filename = '../input/urbansound8k/fold7/104625-4-0-52.wav'
print_prediction_mfcc_mean_std(filename)

In [None]:
# Testing File 2: the siren clip that was plotted in the exploration section
print('Expected siren')
filename = '../input/urbansound8k/fold4/24347-8-0-12.wav'
print_prediction_mfcc_mean_std(filename)

In [None]:
# Testing File 3: a dog_bark clip
# NOTE(review): these demo files come from the dataset that was split for training,
# so they may have been seen during training; confirm before treating this as a test.
print('Expected dog_bark')
filename = '../input/urbansound8k/fold6/101281-3-0-5.wav'
print_prediction_mfcc_mean_std(filename)

The model seems to be classifying the audios correctly!

# Creating splicing function to classify audios in the same file

Now that our model is ready, let's create a function to identify segments in an audio file to classify them separately

In [None]:
def splice_audio(filename):
    '''Split an audio file into its non-silent segments.

    The file is loaded with librosa.load using its defaults and cut with
    librosa.effects.split, treating everything more than 50 dB below the
    reference level as silence.

    Parameters
    ----------
    filename : str
        Path of the audio file to split.

    Returns
    -------
    list
        One 1D array of samples per non-silent segment, in order of appearance.
    '''
    data, _ = librosa.load(filename)

    # top_db is keyword-only in librosa >= 0.10; passing it by name works on every version.
    splits_indexes = librosa.effects.split(data, top_db=50)

    # each row of splits_indexes is a (start, end) sample interval
    return [data[start:end] for start, end in splits_indexes]


def predict_spliced_audio(audios, sample_rate=22050):
    '''Classify each audio segment in a list and print the results.

    For every segment, 60 MFCCs are computed and summarised by their mean and std
    (the same 120 features used for training), the notebook-level `model` predicts
    the class probabilities, and the notebook-level label encoder `le` maps class
    ids back to class names.

    Parameters
    ----------
    audios : list
        1D sample arrays, e.g. the output of splice_audio.
    sample_rate : int, optional
        Sampling rate of the segments. Defaults to 22050, the default sampling
        rate of librosa.load, which is how splice_audio loads its file.
        Previously this was read from an undefined global and raised NameError.
    '''
    for segment in audios:
        mfccs = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=60)
        mfccs_mean = np.mean(mfccs.T,axis=0)
        mfccs_std = np.std(mfccs.T, axis=0)
        prediction_feature = np.array([np.hstack([mfccs_mean, mfccs_std])])

        # Sequential.predict_classes / predict_proba were removed from Keras (TF 2.6).
        # predict() returns the same probabilities and argmax gives the class id.
        predicted_proba_vector = model.predict(prediction_feature, verbose=0)
        predicted_vector = np.argmax(predicted_proba_vector, axis=-1)
        predicted_class = le.inverse_transform(predicted_vector)
        print("\n", "Class: ", predicted_class[0], '\n')

        predicted_proba = predicted_proba_vector[0]
        categories = le.inverse_transform(np.arange(len(predicted_proba)))
        for category, probability in zip(categories, predicted_proba):
            print(category, ": ", format(probability, '.2f'), sep='')
        print('\n', '=-' *20)
        
        

def splice_and_classify(filename):
    '''Split an audio file into non-silent segments, classify each one and print the results.

    The file is loaded with librosa.load using its defaults and cut with
    librosa.effects.split (top_db=50). Every segment is described by the mean and
    std of 60 MFCCs (the same 120 features used for training), the notebook-level
    `model` predicts the class probabilities, and the notebook-level label encoder
    `le` maps class ids back to class names.

    Parameters
    ----------
    filename : str
        Path of the audio file to split and classify.
    '''
    data,sample_rate = librosa.load(filename)

    # Splicing
    # top_db is keyword-only in librosa >= 0.10; passing it by name works on every version.
    splits_indexes = librosa.effects.split(data, top_db=50)
    audios = [data[start:end] for start, end in splits_indexes]

    # Using spliced audio segments
    for segment in audios:

        # extracting features
        mfccs = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=60)
        mfccs_mean = np.mean(mfccs.T,axis=0)
        mfccs_std = np.std(mfccs.T, axis=0)
        prediction_feature = np.array([np.hstack([mfccs_mean, mfccs_std])])

        # predicts using the trained model
        # Sequential.predict_classes / predict_proba were removed from Keras (TF 2.6).
        # predict() returns the same probabilities and argmax gives the class id.
        predicted_proba_vector = model.predict(prediction_feature, verbose=0)
        predicted_vector = np.argmax(predicted_proba_vector, axis=-1)
        predicted_class = le.inverse_transform(predicted_vector)
        print("\n", "Class: ", predicted_class[0], '\n')

        # printing the probability for each class
        predicted_proba = predicted_proba_vector[0]
        categories = le.inverse_transform(np.arange(len(predicted_proba)))
        for category, probability in zip(categories, predicted_proba):
            print(category, ": ", format(probability, '.2f'), sep='')
        print('\n', '=-' *20)

Using some YouTube videos, I created an audio file to check whether every classification is correct.
These are the videos I used to make this file:

> 1.  siren (0:24-0:30)
    https://www.youtube.com/watch?v=TXpm0gKG17k
> 2.  dog_bark (0:30-0:37)
    https://www.youtube.com/watch?v=iuy-oOJCOoM
> 3.  gun_shot (0:00-0:05)
    https://www.youtube.com/watch?v=-lbUHipN0F0
> 4.  children_playing (0:05-0:15)
    https://www.youtube.com/watch?v=7iDxLF2PWFw
 

In [None]:
# Custom recording: plot the waveform, classify every non-silent segment,
# then embed an audio player (ipd.Audio must stay the last expression to be displayed).
filename = '../input/custom-test-audio/custom_test_audio.wav'
plot_sound(filename)
splice_and_classify(filename)
ipd.Audio(filename)

Every classification is correct!

#######

# Bonus Section

You can use the .h5 model that is already created to classify other audio files. With only the cells below you can make the classification; there's no need to run any cell above this section.

In [None]:
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.models import load_model

In [None]:
def splice_and_classify_model_input(filename, model):
    '''Alternate version of the splice_and_classify function. Please use .wav audio files.

    Plots the waveform of the file, splits it into non-silent segments
    (librosa.effects.split, top_db=50), describes every segment by the mean and std
    of 60 MFCCs and prints the predicted class and the per-class probabilities.

    Unlike splice_and_classify, this version does not rely on notebook-level state:
    the model is passed in and the label encoder is rebuilt from the class names.

    Parameters
    ----------
    filename : str
        Path of the audio file to split and classify.
    model : keras model
        Trained classifier whose predict() returns one probability per class.
    '''
    # Imported here so this standalone section works on a fresh kernel: `import librosa`
    # alone does not make librosa.display available on older librosa versions.
    import librosa.display

    plt.figure(figsize=(30,4))
    data,sample_rate = librosa.load(filename)
    # waveplot was removed in librosa 0.10 in favour of waveshow; fall back on old versions.
    draw_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
    draw_wave(data, sr=sample_rate)

    # Rebuild the label encoder once (it does not depend on the segment).
    # LabelEncoder sorts the class names, so the listing order below is irrelevant.
    le = LabelEncoder()
    le.fit(np.array(['dog_bark', 'children_playing', 'car_horn', 'air_conditioner', 'street_music', 'gun_shot', 'siren', 'engine_idling', 'jackhammer', 'drilling']))

    # top_db is keyword-only in librosa >= 0.10; passing it by name works on every version.
    splits_indexes = librosa.effects.split(data, top_db=50)
    audios = [data[start:end] for start, end in splits_indexes]

    for segment in audios:

        # the audio signal must be passed as y=... (keyword-only in librosa >= 0.10)
        mfccs = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=60)
        mfccs_mean = np.mean(mfccs.T,axis=0)
        mfccs_std = np.std(mfccs.T, axis=0)
        prediction_feature = np.array([np.hstack([mfccs_mean, mfccs_std])])

        # Sequential.predict_classes / predict_proba were removed from Keras (TF 2.6).
        # predict() returns the same probabilities and argmax gives the class id.
        predicted_proba_vector = model.predict(prediction_feature, verbose=0)
        predicted_vector = np.argmax(predicted_proba_vector, axis=-1)
        predicted_class = le.inverse_transform(predicted_vector)
        print("\n", "Class: ", predicted_class[0], '\n')

        predicted_proba = predicted_proba_vector[0]
        categories = le.inverse_transform(np.arange(len(predicted_proba)))
        for category, probability in zip(categories, predicted_proba):
            print(category, ": ", format(probability, '.2f'), sep='')
        print('\n', '=-' *20, sep='')

In [None]:
# Load a previously trained model from its .h5 file.
# NOTE(review): presumably a checkpoint written by the training section above; confirm.
model = load_model('../input/modelo/audio_classification_best_model (2).h5')

In [None]:
# Plot the custom recording and classify each of its non-silent segments with the loaded model
filename = '../input/custom-test-audio/custom_test_audio.wav'
splice_and_classify_model_input(filename, model)