# Audio Classification 

Dataset here [https://urbansounddataset.weebly.com/](https://urbansounddataset.weebly.com/)

## Data Preprocessing

### Features Extraction


In [1]:
# Pkgs loading
import pandas as pd
import os
import librosa
import numpy as np
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
audio_dataset_path='/content/drive/MyDrive/UrbanSound8K/audio'
metadata=pd.read_csv('/content/drive/MyDrive/UrbanSound8K/metadata/UrbanSound8K.csv')
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [4]:
# Dataset Balancing/Imbalancing Check
metadata['class'].value_counts()

street_music        1000
air_conditioner     1000
engine_idling       1000
jackhammer          1000
drilling            1000
dog_bark            1000
children_playing    1000
siren                929
car_horn             429
gun_shot             374
Name: class, dtype: int64

In [5]:
# Extracting MFCC's For every audio file
def features_extractor(file_name):
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    
    return mfccs_scaled_features

In [6]:
# Features extraction from all audio files (MFCC)
extracted_features=[]
for index_num,row in tqdm(metadata.iterrows()):
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    final_class_labels=row["class"]
    data=features_extractor(file_name)
    extracted_features.append([data,final_class_labels])

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
8732it [2:10:01,  1.12it/s]


In [8]:
# Converting extracted_features to Pandas dataframe
extracted_features_df=pd.DataFrame(extracted_features,columns=['feature','class'])
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-215.79301, 71.66612, -131.81377, -52.09133, ...",dog_bark
1,"[-424.68677, 110.56227, -54.14824, 62.01074, -...",children_playing
2,"[-459.56467, 122.80034, -47.92471, 53.265705, ...",children_playing
3,"[-414.55377, 102.896904, -36.66495, 54.18041, ...",children_playing
4,"[-447.397, 115.0954, -53.809113, 61.60859, 1.6...",children_playing


In [9]:
# Data Frame Saving
extracted_features_df.to_csv("UrbanSound8K_DF.csv")

### Data Splitting and encoding

In [60]:
# Data Splitting
X=np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())

In [61]:
X.shape

(8732, 40)

In [62]:
y

array(['dog_bark', 'children_playing', 'children_playing', ...,
       'car_horn', 'car_horn', 'car_horn'], dtype='<U16')

In [63]:
# Label Encoding
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [64]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [65]:
# Training Testing Sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [66]:
X_train

array([[-1.3183614e+02,  1.1397464e+02, -2.3956861e+01, ...,
         3.3314774e+00, -1.4786109e+00,  2.8736603e+00],
       [-1.4074220e+01,  9.1916939e+01, -8.6787224e+00, ...,
        -3.3844025e+00, -5.2119040e+00, -1.5936136e+00],
       [-4.9532028e+01,  1.5521857e-01, -2.0369110e+01, ...,
         2.0491767e+00, -8.0537486e-01,  2.7793026e+00],
       ...,
       [-4.2699329e+02,  9.2890648e+01,  3.0233374e+00, ...,
         8.6335999e-01,  6.4766812e-01,  7.8490508e-01],
       [-1.4607024e+02,  1.3709459e+02, -3.4298344e+01, ...,
         1.3777871e+00, -1.9530845e+00, -8.9652115e-01],
       [-4.2167450e+02,  2.1169034e+02,  2.6820309e+00, ...,
        -5.1484947e+00, -3.6400862e+00, -1.3321607e+00]], dtype=float32)

In [67]:
X_train.shape

(6985, 40)

In [68]:
X_test.shape

(1747, 40)

In [69]:
y_train.shape

(6985, 10)

In [70]:
y_test.shape

(1747, 10)

### Model Creation

In [71]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [72]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [73]:
# No of classes
num_labels=y.shape[1]
print(num_labels)

10


In [78]:
model=Sequential()
#first layer
model.add(Dense(3504,input_shape=(40,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
#second layer
model.add(Dense(1752))
model.add(Activation('relu'))
model.add(Dropout(0.5))
#third layer
model.add(Dense(876))
model.add(Activation('relu'))
model.add(Dropout(0.5))

#final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [79]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 3504)              143664    
_________________________________________________________________
activation_8 (Activation)    (None, 3504)              0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 3504)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 1752)              6140760   
_________________________________________________________________
activation_9 (Activation)    (None, 1752)              0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 1752)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 876)              

In [81]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [82]:
# Model training
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

In [83]:
num_epochs = 100
num_batch_size = 128

checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.h5', verbose=1, save_best_only=True)
start = datetime.now()

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.66232, saving model to saved_models/audio_classification.h5
Epoch 2/100

Epoch 00002: val_loss improved from 1.66232 to 1.43536, saving model to saved_models/audio_classification.h5
Epoch 3/100

Epoch 00003: val_loss improved from 1.43536 to 1.29780, saving model to saved_models/audio_classification.h5
Epoch 4/100

Epoch 00004: val_loss improved from 1.29780 to 1.19516, saving model to saved_models/audio_classification.h5
Epoch 5/100

Epoch 00005: val_loss improved from 1.19516 to 1.07944, saving model to saved_models/audio_classification.h5
Epoch 6/100

Epoch 00006: val_loss improved from 1.07944 to 0.98812, saving model to saved_models/audio_classification.h5
Epoch 7/100

Epoch 00007: val_loss improved from 0.98812 to 0.92731, saving model to saved_models/audio_classification.h5
Epoch 8/100

Epoch 00008: val_loss improved from 0.92731 to 0.83431, saving model to saved_models/audio_classification.h5
Epoch 9/100

Epoch 00009: va

In [84]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.923297107219696


### Testing Some NEW Audio Data

In [94]:
filename="/content/Dog-Bark.wav"
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast') 
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)

In [95]:
mfccs_scaled_features.shape

(40,)

In [100]:
mfccs_scaled_features=mfccs_scaled_features.reshape(1,-1)
print(mfccs_scaled_features.shape)
predicted_label = np.argmax(model.predict(mfccs_scaled_features), axis=-1)
print('Predicted Label:',predicted_label)
prediction_class = labelencoder.inverse_transform(predicted_label) 
prediction_class[0]

(1, 40)
Predicted Label: [3]


'dog_bark'