## Importing Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

import IPython.display as ipd
import librosa
import librosa.display

## Defining Constants

In [33]:
# Data and model locations. Both can be overridden through environment
# variables so the notebook is not tied to a single machine; when the variables
# are unset the original local layout is used unchanged.
# os.path.join(..., "") guarantees a trailing separator, which the plain string
# concatenations built on top of these values rely on.
path = os.path.join(
    os.environ.get(
        "AUDIO_CLASSIFICATION_DATA_DIR",
        "/Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Data/",
    ),
    "",
)
Model_path = os.path.join(
    os.environ.get(
        "AUDIO_CLASSIFICATION_MODEL_DIR",
        "/Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/",
    ),
    "",
)

# Paths relative to the data root.
audio_file = "UrbanSound8K/audio/"
audio_dataset_path = path+audio_file
metadata = "UrbanSound8K/metadata/UrbanSound8K.csv"

# Training hyperparameters.
num_epochs = 100
num_batch_size = 32

In [3]:
# The metadata CSV lives at the data root followed by the relative metadata path.
metadata_csv = f"{path}{metadata}"
meta_data = pd.read_csv(metadata_csv)

# Cell output: (rows, columns) of the metadata table.
meta_data.shape

(8732, 8)

In [4]:
# Preview the first rows of the metadata table (file name, fold, class id, class name, ...).
meta_data.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


## Creating Features

In [5]:
def features_extractor(file, n_mfcc = 40):
    """Summarise one audio file as a fixed-length vector of mean MFCCs.

    Parameters
    ----------
    file : str or path-like
        Audio file to read; handed directly to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute. This is a hyperparameter;
        other values such as 30, 50 or 60 can be tried.

    Returns
    -------
    numpy.ndarray
        1-D array with one value per coefficient: the mean of that
        coefficient over all frames of the clip.
    """
    # Load from the `file` argument. Reading a notebook-level `file_name`
    # variable here would make the function ignore its own parameter and only
    # work when a caller happened to set that global first.
    audio, sample_rate = librosa.load(file,
                                      res_type = "kaiser_fast")
    mfccs_features = librosa.feature.mfcc(y = audio,
                                          sr = sample_rate,
                                          n_mfcc = n_mfcc)
    # Transpose so frames run along axis 0, then average them away. Despite
    # the variable name no scaling is applied; this is a plain mean.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis = 0)

    return mfccs_scaled_features

## Iterating through every audio file for feature extraction using MFCC

In [6]:
extracted_features = []

# Folder that holds the per-fold sub-directories; resolved once because it
# does not change from row to row.
audio_root = os.path.abspath(audio_dataset_path)

# iterrows() is a generator with no length, so the row count is passed
# explicitly; without it tqdm can only show a running counter, not a progress
# bar or a time estimate.
for index_num, row in tqdm(meta_data.iterrows(), total = len(meta_data)):
    # Clips are stored as <audio root>/fold<k>/<slice_file_name>.
    file_name = os.path.join(audio_root, "fold"+str(row["fold"])+"/", str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    # Keep the feature vector together with its label.
    extracted_features.append([data, final_class_labels])

8732it [04:57, 29.35it/s]


## Converting the extracted features to DataFrame

In [7]:
# One row per audio clip: the feature vector in "feature", its label in "class".
feature_columns = ["feature", "class"]
extracted_features_df = pd.DataFrame(extracted_features, columns = feature_columns)

# Cell output: (rows, columns) of the resulting frame.
extracted_features_df.shape

(8732, 2)

In [8]:
# Preview the first rows: each "feature" cell holds a whole array, "class" holds the label.
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-217.35526, 70.22338, -130.38527, -53.282898,...",dog_bark
1,"[-424.09818, 109.34077, -52.919525, 60.86475, ...",children_playing
2,"[-458.79114, 121.38419, -46.520657, 52.00812, ...",children_playing
3,"[-413.89984, 101.66373, -35.42945, 53.036358, ...",children_playing
4,"[-446.60352, 113.68541, -52.402206, 60.302044,...",children_playing


## Splitting the dataset into independent and dependent features

In [20]:
# Independent features: the per-clip vectors stacked into one 2-D matrix.
feature_rows = extracted_features_df["feature"].to_numpy().tolist()
X = np.array(feature_rows)

# Dependent feature: the class label of every clip as a 1-D array.
label_rows = extracted_features_df["class"].to_numpy().tolist()
y = np.array(label_rows)

print(X.shape, y.shape)

(8732, 40) (8732,)


## Encoding the target

In [21]:
# One-hot encode the class labels (one integer 0/1 column per class).
# The encoding is derived from the DataFrame column rather than from `y`
# itself, so re-running this cell always yields the same matrix instead of
# feeding an already-encoded `y` back into get_dummies.
y = pd.get_dummies(extracted_features_df["class"], dtype = int).to_numpy()

In [22]:
# Shape of the encoded target: (number of clips, number of classes).
y.shape

(8732, 10)

In [23]:
# Display the encoded target matrix.
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

## Splitting the dataset into Train and Test

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the clips for testing; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): this is a random split over all clips. The metadata carries a
# predefined `fold` column and several slices share one `fsID`, so slices of
# the same recording may end up on both sides of the split. Confirm whether
# a fold-based split is wanted instead.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6985, 40) (1747, 40) (6985, 10) (1747, 10)


## Finding the number of classes in the target variable

In [26]:
# Number of target classes = number of columns in the one-hot encoded y.
num_labels = y.shape[1]

In [27]:
# Display the class count.
num_labels

10

## Model Creation

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## ANN Model

In [30]:
# Feed-forward classifier: three hidden Dense blocks, each followed by ReLU and
# 50% dropout, then a softmax output with one unit per class.
# The input width is read from the training matrix instead of being hard-coded,
# so changing the number of extracted features cannot silently mismatch the
# first layer.
num_features = X_train.shape[1]

model = Sequential([
    # First layer
    Dense(units = 100, input_shape = (num_features,)),
    Activation("relu"),
    Dropout(0.5),
    # Second layer
    Dense(units = 200),
    Activation("relu"),
    Dropout(0.5),
    # Third layer
    Dense(units = 100),
    Activation("relu"),
    Dropout(0.5),
    # Final layer
    Dense(num_labels),
    Activation("softmax"),
])

## Model Summary

In [31]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 100)               4100      
                                                                 
 activation (Activation)     (None, 100)               0         
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 200)               20200     
                                                                 
 activation_1 (Activation)   (None, 200)               0         
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_3 (Dense)             (None, 100)              

## Model Compilation

In [32]:
# Configure training: Adam optimizer, categorical cross-entropy to match the
# one-hot targets and softmax output, accuracy reported as the metric.
model.compile(
    optimizer = "adam",
    loss = "categorical_crossentropy",
    metrics = ["accuracy"],
)

## Model Training

In [38]:
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

# Save the model to disk only when the monitored validation loss improves.
# NOTE(review): the held-out test split is passed as validation data, so the
# checkpoint is selected on the same data that is later used for evaluation.
# Confirm this is intended.
checkpoint_file = f"{Model_path}audio_classification.hdf5"
checkpointer = ModelCheckpoint(
    filepath = checkpoint_file,
    save_best_only = True,
    verbose = 1,
)

# Wall-clock timing of the whole training run.
start = datetime.now()

model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = num_epochs,
    batch_size = num_batch_size,
    callbacks = [checkpointer],
    verbose = 1,
)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.70684, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.70684 to 0.69705, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69705
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69705
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69705
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69705
Epoch 7/100
Epoch 7: val_loss improved from 0.69705 to 0.69178, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 8/100
Epoch 8: val_loss improved from 0.69178 to 0.67978, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Le

Epoch 28/100
Epoch 28: val_loss did not improve from 0.66916
Epoch 29/100
Epoch 29: val_loss did not improve from 0.66916
Epoch 30/100
Epoch 30: val_loss did not improve from 0.66916
Epoch 31/100
Epoch 31: val_loss improved from 0.66916 to 0.66803, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 32/100
Epoch 32: val_loss did not improve from 0.66803
Epoch 33/100
Epoch 33: val_loss improved from 0.66803 to 0.65885, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 34/100
Epoch 34: val_loss improved from 0.65885 to 0.65354, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 35/100
Epoch 35: val_loss did not improve from 0.65354
Epoch 36/100
Epoch 36: val_loss did not improve from 0.65354
Epoch 3

Epoch 54/100
Epoch 54: val_loss did not improve from 0.63094
Epoch 55/100
Epoch 55: val_loss did not improve from 0.63094
Epoch 56/100
Epoch 56: val_loss did not improve from 0.63094
Epoch 57/100
Epoch 57: val_loss did not improve from 0.63094
Epoch 58/100
Epoch 58: val_loss did not improve from 0.63094
Epoch 59/100
Epoch 59: val_loss did not improve from 0.63094
Epoch 60/100
Epoch 60: val_loss did not improve from 0.63094
Epoch 61/100
Epoch 61: val_loss improved from 0.63094 to 0.62490, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 62/100
Epoch 62: val_loss did not improve from 0.62490
Epoch 63/100
Epoch 63: val_loss did not improve from 0.62490
Epoch 64/100
Epoch 64: val_loss did not improve from 0.62490
Epoch 65/100
Epoch 65: val_loss improved from 0.62490 to 0.61889, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Mode

Epoch 81/100
Epoch 81: val_loss did not improve from 0.61147
Epoch 82/100
Epoch 82: val_loss did not improve from 0.61147
Epoch 83/100
Epoch 83: val_loss did not improve from 0.61147
Epoch 84/100
Epoch 84: val_loss did not improve from 0.61147
Epoch 85/100
Epoch 85: val_loss did not improve from 0.61147
Epoch 86/100
Epoch 86: val_loss did not improve from 0.61147
Epoch 87/100
Epoch 87: val_loss did not improve from 0.61147
Epoch 88/100
Epoch 88: val_loss did not improve from 0.61147
Epoch 89/100
Epoch 89: val_loss improved from 0.61147 to 0.59857, saving model to /Users/priyankar83/Documents/Learning/Data_Science_Learning/Deep_Learning/Audio_Classification/Model/audio_classification.hdf5
Epoch 90/100
Epoch 90: val_loss did not improve from 0.59857
Epoch 91/100
Epoch 91: val_loss did not improve from 0.59857
Epoch 92/100
Epoch 92: val_loss did not improve from 0.59857
Epoch 93/100
Epoch 93: val_loss did not improve from 0.59857
Epoch 94/100
Epoch 94: val_loss did not improve from 0.5985

In [39]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# accuracy, which is the only value reported here.
# NOTE(review): this scores the in-memory model as left by the last epoch, not
# the best checkpoint written during training. Confirm which one is wanted.
accuracy = model.evaluate(X_test, y_test, verbose = 0)
test_accuracy = accuracy[1]
print(test_accuracy)

0.8093875050544739
