## Importing Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

import IPython.display as ipd
import librosa
import librosa.display

## Defining Constants

In [2]:
path = "/Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Data/"
Model_path = "/Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/"
audio_file = "UrbanSound8K/audio/"
audio_dataset_path = path+audio_file
metadata = "UrbanSound8K/metadata/UrbanSound8K.csv"

num_epochs = 200
num_batch_size = 32

In [3]:
meta_data = pd.read_csv(path+metadata)

meta_data.shape

(8732, 8)

In [4]:
meta_data.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


## Creating Features

In [5]:
def features_extractor(file):
    audio, sample_rate = librosa.load(file_name, 
                                      res_type = "kaiser_fast")
    mfccs_features = librosa.feature.mfcc(y = audio, 
                                          sr = sample_rate,
                             # n_mfcc is a hyperparameter, we can try different different values such as 30, 50, 60 etc.
                                          n_mfcc = 40)
    mfccs_scaled_features = np.mean(mfccs_features.T, axis = 0)
    
    return (mfccs_scaled_features)

## Iterating through every audio file for feature extraction using MFCC

In [6]:
extracted_features = []

for index_num, row in tqdm(meta_data.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path), "fold"+str(row["fold"])+"/", str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])

8732it [04:46, 30.43it/s]


## Converting the extracted features to DataFrame

In [7]:
extracted_features_df = pd.DataFrame(data = extracted_features, columns = ["feature", "class"])

extracted_features_df.shape

(8732, 2)

In [8]:
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-217.35526, 70.22338, -130.38527, -53.282898,...",dog_bark
1,"[-424.09818, 109.34077, -52.919525, 60.86475, ...",children_playing
2,"[-458.79114, 121.38419, -46.520657, 52.00812, ...",children_playing
3,"[-413.89984, 101.66373, -35.42945, 53.036358, ...",children_playing
4,"[-446.60352, 113.68541, -52.402206, 60.302044,...",children_playing


## Splitting the dataset into independent and dependent features

In [9]:
X = np.array(extracted_features_df["feature"].values.tolist())
y = np.array(extracted_features_df["class"].values.tolist())

print(X.shape, y.shape)

(8732, 40) (8732,)


In [11]:
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

## Encoding the target

In [13]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

y = to_categorical(labelencoder.fit_transform(y))

y.shape

(8732, 10)

In [14]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

## Splitting the dataset into Train and Test

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6985, 40) (1747, 40) (6985, 10) (1747, 10)


## Finding number of classes in Target variable

In [16]:
num_labels = y.shape[1]

In [17]:
num_labels

10

## Model Creation

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## ANN Model

In [19]:
model = Sequential()
#First layer
model.add(Dense(units = 100, input_shape = (40,)))
model.add(Activation("relu"))
model.add(Dropout(0.5))
#Second layer
model.add(Dense(units = 200))
model.add(Activation("relu"))
model.add(Dropout(0.5))
#Third layer
model.add(Dense(units = 100))
model.add(Activation("relu"))
model.add(Dropout(0.5))
#Final layer
model.add(Dense(num_labels))
model.add(Activation("softmax"))

## Model Summary

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               4100      
                                                                 
 activation (Activation)     (None, 100)               0         
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 200)               20200     
                                                                 
 activation_1 (Activation)   (None, 200)               0         
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 100)               2

## Model Compilation

In [21]:
model.compile(loss = "categorical_crossentropy", metrics = ["accuracy"], optimizer = "adam")

## Model Training

In [41]:
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

checkpointer = ModelCheckpoint(filepath = Model_path+"audio_classification.hdf5", 
                               verbose =1, 
                               save_best_only = True)

start = datetime.now()

model.fit(X_train, y_train, 
          batch_size = num_batch_size, 
          epochs = num_epochs, 
          validation_data = (X_test, y_test),
          callbacks = [checkpointer],
          verbose = 1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/200
Epoch 1: val_loss improved from inf to 0.65800, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 2/200
Epoch 2: val_loss improved from 0.65800 to 0.65352, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 3/200
Epoch 3: val_loss did not improve from 0.65352
Epoch 4/200
Epoch 4: val_loss did not improve from 0.65352
Epoch 5/200
Epoch 5: val_loss improved from 0.65352 to 0.64729, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 6/200
Epoch 6: val_loss did not improve from 0.64729
Epoch 7/200
Epoch 7: val_loss did not improve from 0.64729
Epoch 8/200
Epoch 8: val_loss did not improve from 0.64729
Epoch 9/200
Epoch 9: val_loss improved from 0.64729 to 0.64524, saving model to /Users/p

Epoch 28/200
Epoch 28: val_loss improved from 0.64179 to 0.63455, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 29/200
Epoch 29: val_loss did not improve from 0.63455
Epoch 30/200
Epoch 30: val_loss did not improve from 0.63455
Epoch 31/200
Epoch 31: val_loss did not improve from 0.63455
Epoch 32/200
Epoch 32: val_loss did not improve from 0.63455
Epoch 33/200
Epoch 33: val_loss improved from 0.63455 to 0.63211, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 34/200
Epoch 34: val_loss improved from 0.63211 to 0.62920, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 35/200
Epoch 35: val_loss did not improve from 0.62920
Epoch 36/200
Epoch 36: val_loss did not improve from 0.62920
Epoch 3

Epoch 56/200
Epoch 56: val_loss did not improve from 0.62920
Epoch 57/200
Epoch 57: val_loss did not improve from 0.62920
Epoch 58/200
Epoch 58: val_loss did not improve from 0.62920
Epoch 59/200
Epoch 59: val_loss did not improve from 0.62920
Epoch 60/200
Epoch 60: val_loss did not improve from 0.62920
Epoch 61/200
Epoch 61: val_loss did not improve from 0.62920
Epoch 62/200
Epoch 62: val_loss did not improve from 0.62920
Epoch 63/200
Epoch 63: val_loss did not improve from 0.62920
Epoch 64/200
Epoch 64: val_loss did not improve from 0.62920
Epoch 65/200
Epoch 65: val_loss did not improve from 0.62920
Epoch 66/200
Epoch 66: val_loss did not improve from 0.62920
Epoch 67/200
Epoch 67: val_loss did not improve from 0.62920
Epoch 68/200
Epoch 68: val_loss did not improve from 0.62920
Epoch 69/200
Epoch 69: val_loss did not improve from 0.62920
Epoch 70/200
Epoch 70: val_loss did not improve from 0.62920
Epoch 71/200
Epoch 71: val_loss did not improve from 0.62920
Epoch 72/200
Epoch 72: v

Epoch 85: val_loss did not improve from 0.62404
Epoch 86/200
Epoch 86: val_loss did not improve from 0.62404
Epoch 87/200
Epoch 87: val_loss did not improve from 0.62404
Epoch 88/200
Epoch 88: val_loss did not improve from 0.62404
Epoch 89/200
Epoch 89: val_loss did not improve from 0.62404
Epoch 90/200
Epoch 90: val_loss did not improve from 0.62404
Epoch 91/200
Epoch 91: val_loss did not improve from 0.62404
Epoch 92/200
Epoch 92: val_loss did not improve from 0.62404
Epoch 93/200
Epoch 93: val_loss did not improve from 0.62404
Epoch 94/200
Epoch 94: val_loss did not improve from 0.62404
Epoch 95/200
Epoch 95: val_loss did not improve from 0.62404
Epoch 96/200
Epoch 96: val_loss did not improve from 0.62404
Epoch 97/200
Epoch 97: val_loss did not improve from 0.62404
Epoch 98/200
Epoch 98: val_loss did not improve from 0.62404
Epoch 99/200
Epoch 99: val_loss did not improve from 0.62404
Epoch 100/200
Epoch 100: val_loss did not improve from 0.62404
Epoch 101/200
Epoch 101: val_loss d

Epoch 115/200
Epoch 115: val_loss did not improve from 0.62404
Epoch 116/200
Epoch 116: val_loss did not improve from 0.62404
Epoch 117/200
Epoch 117: val_loss did not improve from 0.62404
Epoch 118/200
Epoch 118: val_loss did not improve from 0.62404
Epoch 119/200
Epoch 119: val_loss did not improve from 0.62404
Epoch 120/200
Epoch 120: val_loss improved from 0.62404 to 0.60655, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 121/200
Epoch 121: val_loss did not improve from 0.60655
Epoch 122/200
Epoch 122: val_loss did not improve from 0.60655
Epoch 123/200
Epoch 123: val_loss did not improve from 0.60655
Epoch 124/200
Epoch 124: val_loss did not improve from 0.60655
Epoch 125/200
Epoch 125: val_loss did not improve from 0.60655
Epoch 126/200
Epoch 126: val_loss did not improve from 0.60655
Epoch 127/200
Epoch 127: val_loss did not improve from 0.60655
Epoch 128/200
Epoch 128: val_loss

Epoch 144/200
Epoch 144: val_loss did not improve from 0.60655
Epoch 145/200
Epoch 145: val_loss did not improve from 0.60655
Epoch 146/200
Epoch 146: val_loss did not improve from 0.60655
Epoch 147/200
Epoch 147: val_loss did not improve from 0.60655
Epoch 148/200
Epoch 148: val_loss did not improve from 0.60655
Epoch 149/200
Epoch 149: val_loss did not improve from 0.60655
Epoch 150/200
Epoch 150: val_loss did not improve from 0.60655
Epoch 151/200
Epoch 151: val_loss did not improve from 0.60655
Epoch 152/200
Epoch 152: val_loss did not improve from 0.60655
Epoch 153/200
Epoch 153: val_loss did not improve from 0.60655
Epoch 154/200
Epoch 154: val_loss did not improve from 0.60655
Epoch 155/200
Epoch 155: val_loss did not improve from 0.60655
Epoch 156/200
Epoch 156: val_loss did not improve from 0.60655
Epoch 157/200
Epoch 157: val_loss did not improve from 0.60655
Epoch 158/200
Epoch 158: val_loss did not improve from 0.60655
Epoch 159/200
Epoch 159: val_loss did not improve from 

Epoch 173/200
Epoch 173: val_loss did not improve from 0.60655
Epoch 174/200
Epoch 174: val_loss did not improve from 0.60655
Epoch 175/200
Epoch 175: val_loss did not improve from 0.60655
Epoch 176/200
Epoch 176: val_loss did not improve from 0.60655
Epoch 177/200
Epoch 177: val_loss did not improve from 0.60655
Epoch 178/200
Epoch 178: val_loss did not improve from 0.60655
Epoch 179/200
Epoch 179: val_loss did not improve from 0.60655
Epoch 180/200
Epoch 180: val_loss did not improve from 0.60655
Epoch 181/200
Epoch 181: val_loss did not improve from 0.60655
Epoch 182/200
Epoch 182: val_loss did not improve from 0.60655
Epoch 183/200
Epoch 183: val_loss did not improve from 0.60655
Epoch 184/200
Epoch 184: val_loss did not improve from 0.60655
Epoch 185/200
Epoch 185: val_loss did not improve from 0.60655
Epoch 186/200
Epoch 186: val_loss did not improve from 0.60655
Epoch 187/200
Epoch 187: val_loss did not improve from 0.60655
Epoch 188/200
Epoch 188: val_loss did not improve from 

In [42]:
accuracy = model.evaluate(X_test, y_test, verbose = 0)
print(accuracy[1])

0.8191184997558594


## Working with Test Data

In [37]:
test_filename = "car_noise.wav"

# Feature extraction 
audio, sample_rate = librosa.load(path+test_filename, res_type = "kaiser_fast")
mfccs_features = librosa.feature.mfcc(y = audio, 
                                      sr = sample_rate, 
                                      n_mfcc = 40)

# Normalizing extracted features
mfccs_scaled_features = np.mean(mfccs_features.T, axis = 0)

print(mfccs_scaled_features)

# Converting normalized features into 1d array
mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)

print("\n", mfccs_scaled_features)
print("\n", mfccs_scaled_features.shape)

[-196.822       113.993126    -13.813408      0.40220982  -20.145586
   -4.7625513   -40.68413       4.171539    -18.973984     -3.0413852
  -19.736597      7.505515    -21.69197      -4.053084    -20.116352
    4.8145556   -14.817319      0.9286656   -14.061541      8.725959
   -6.6515527     6.45562      -4.6819024     0.5490028    -9.6971445
   -0.34578854   -8.68798      -0.6733791    -6.8771744     5.059755
  -10.099091     -0.2596309    -2.8073726     4.4415655    -7.439074
   -4.481123     -1.9385126     3.842505     -5.947936     -0.98801756]

 [[-196.822       113.993126    -13.813408      0.40220982  -20.145586
    -4.7625513   -40.68413       4.171539    -18.973984     -3.0413852
   -19.736597      7.505515    -21.69197      -4.053084    -20.116352
     4.8145556   -14.817319      0.9286656   -14.061541      8.725959
    -6.6515527     6.45562      -4.6819024     0.5490028    -9.6971445
    -0.34578854   -8.68798      -0.6733791    -6.8771744     5.059755
   -10.099091     -

## Predicting the test data

In [38]:
predicted_label = np.argmax(model.predict(mfccs_scaled_features),axis=1)

print(predicted_label)

[9]


## Finding the class of the predicted value

In [40]:
prediction_class = labelencoder.inverse_transform(predicted_label)

print(prediction_class[0])

street_music
