## **KNN Model for Single Audio Event Detection**


In [None]:
import numpy 
import os
import pandas as pd
import librosa
import librosa.display
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from keras.preprocessing import image
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset paths used below can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Initialize all path variables
# dir_path: project folder on the mounted Drive (also holds annotations.csv)
# train_data_path: sub-folder containing the per-clip .npy files
dir_path = '/content/drive/MyDrive/Audio_Classification-MLSP'
train_data_path=os.path.join(dir_path, "train")

In [None]:
# Read the annotation table ('annotations.csv') that sits in the project folder
annotations_csv = os.path.join(dir_path, "annotations.csv")
data = pd.read_csv(annotations_csv)

### **Preprocessing data**

In [None]:
# List containing all spectrograms (one 2-D dB-scaled array per annotated file)
mel_spectrograms = []

for file in data['fname']:
  arr = numpy.load(os.path.join(train_data_path, file))
  # The stored arrays are 3-D with a leading axis of length 1. reshape() drops
  # that axis and raises if it is not 1, whereas the in-place resize() used
  # before would silently throw data away in that case.
  _, n_rows, n_cols = arr.shape
  arr = arr.reshape(n_rows, n_cols)
  # Convert a power spectrogram (amplitude squared) to decibel (dB) units,
  # measured relative to this clip's own maximum (so its peak becomes 0 dB)
  mel_spectrogram = librosa.power_to_db(arr, ref=numpy.max)
  mel_spectrograms.append(mel_spectrogram)

1 128 314
(128, 314)
1 128 2504
(128, 2504)
1 128 236
(128, 236)
1 128 1070
(128, 1070)
1 128 272
(128, 272)
1 128 70
(128, 70)
1 128 664
(128, 664)
1 128 1421
(128, 1421)
1 128 432
(128, 432)
1 128 200
(128, 200)
1 128 1211
(128, 1211)
1 128 2512
(128, 2512)
1 128 52
(128, 52)
1 128 868
(128, 868)
1 128 106
(128, 106)
1 128 34
(128, 34)
1 128 2512
(128, 2512)
1 128 2584
(128, 2584)
1 128 72
(128, 72)
1 128 32
(128, 32)
1 128 44
(128, 44)
1 128 194
(128, 194)
1 128 100
(128, 100)
1 128 224
(128, 224)
1 128 1506
(128, 1506)
1 128 269
(128, 269)
1 128 518
(128, 518)
1 128 586
(128, 586)
1 128 100
(128, 100)
1 128 194
(128, 194)
1 128 41
(128, 41)
1 128 359
(128, 359)
1 128 282
(128, 282)
1 128 94
(128, 94)
1 128 314
(128, 314)
1 128 1701
(128, 1701)
1 128 240
(128, 240)
1 128 55
(128, 55)
1 128 2512
(128, 2512)
1 128 1114
(128, 1114)
1 128 230
(128, 230)
1 128 47
(128, 47)
1 128 272
(128, 272)
1 128 309
(128, 309)
1 128 259
(128, 259)
1 128 1512
(128, 1512)
1 128 2357
(128, 2357)
1 128 6

In [None]:
# All spectrograms share the same frequency axis, but the duration (time axis)
# differs from sample to sample. Every spectrogram is therefore right-padded
# along the time axis up to the length of the longest one.

# Length of the longest time axis (0 when there are no spectrograms)
max_duration = max((spec.shape[1] for spec in mel_spectrograms), default=0)

# Pad each spectrogram and append a trailing axis of length 1
features = []
for spec in mel_spectrograms:
  # The spectrograms are in dB relative to their own peak (ref=numpy.max), so
  # 0 is the LOUDEST value a bin can take. Padding with 0 would append
  # full-scale energy after every short clip; pad with the clip's own floor
  # (its minimum dB value) instead.
  pad_value = spec.min() if spec.size else 0
  mat = numpy.pad(spec, [(0, 0), (0, max_duration - spec.shape[1])],
                  mode='constant', constant_values=pad_value)
  mat = mat.reshape((mat.shape[0], mat.shape[1], 1))
  features.append(mat)

In [None]:
# Summary statistics of the label column
data.loc[:, 'label'].describe()

count     1000
unique      10
top       Bark
freq       100
Name: label, dtype: object

In [None]:
# Flatten every padded spectrogram into a 1-D feature vector.
# The vectors are kept as NumPy arrays: the following cell stacks them with
# numpy.array() anyway, so routing each one through a TensorFlow tensor first
# only added copies without changing the result.
features_t = [feature.flatten() for feature in features]

In [None]:
# Collect the per-sample feature vectors into a single numpy.ndarray
features_t = numpy.asarray(features_t)

### **Splitting Dataset into training and validation**

In [None]:
X = features_t
y = data['label']


# Split in an 80:20 ratio. random_state=40 was chosen by hand because it gives
# a roughly balanced split of the classes.
# `shuffle` must be a real boolean: the string "true" only worked because any
# non-empty string is truthy (so would "false"), and scikit-learn versions
# that validate parameters reject a string here.
# NOTE(review): passing stratify=y would guarantee a balanced split instead of
# relying on a lucky seed - consider it, but it changes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40, shuffle=True)

In [None]:
# Show which container type holds the held-out labels, then tally how many
# samples of each class are present in y_test
print(type(y_test))
val_label = pd.Series([label for label in y_test])
val_label.value_counts()

<class 'pandas.core.series.Series'>


Bark                                     22
Walk_and_footsteps                       22
Doorbell                                 22
Crying_and_sobbing                       21
Siren                                    20
Knock                                    20
Vehicle_horn_and_car_horn_and_honking    20
Microwave_oven                           19
Shatter                                  17
Meow                                     17
dtype: int64

### **Convert label data (y_train & y_test) into 'one-hot vector' format**

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit ONE encoder on the training labels and reuse it for the test labels, so
# a given class always maps to the same integer and the same one-hot column.
# (Fitting a second encoder on y_test only matched by luck: it needs every
# class to be present in both splits.)
train_encoder = LabelEncoder()
train_ = train_encoder.fit_transform(y_train)
# Fix the one-hot width explicitly so train and test matrices always agree.
num_classes = len(train_encoder.classes_)
to_onehot_train = to_categorical(train_, num_classes=num_classes)

# Kept as an alias so code that still refers to `test_encoder` keeps working.
test_encoder = train_encoder
# transform() raises ValueError if y_test contains a label unseen in training.
test_ = test_encoder.transform(y_test)
to_onehot_test = to_categorical(test_, num_classes=num_classes)

# From here on y_train / y_test hold the one-hot matrices, not the raw labels.
y_train = to_onehot_train
y_test = to_onehot_test


Definitions for recall, precision and f1 metrics

In [None]:
# reference : https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
from keras import backend 
from sklearn.metrics import precision_score , recall_score
def recall_m(y_true, y_pred):
    """Return the recall of `y_pred` against `y_true`.

    Both arguments are multiplied element-wise, clipped to [0, 1] and rounded,
    so they are treated as 0/1 indicator values. The result is
    (number of true positives) / (number of actual positives + epsilon),
    summed over every element of the inputs.
    """
    hit_mask = backend.round(backend.clip(y_true * y_pred, 0, 1))
    actual_mask = backend.round(backend.clip(y_true, 0, 1))
    # epsilon keeps the division defined when there are no actual positives
    return backend.sum(hit_mask) / (backend.sum(actual_mask) + backend.epsilon())

def precision_m(y_true, y_pred):
    """Return the precision of `y_pred` against `y_true`.

    Both arguments are multiplied element-wise, clipped to [0, 1] and rounded,
    so they are treated as 0/1 indicator values. The result is
    (number of true positives) / (number of predicted positives + epsilon),
    summed over every element of the inputs.
    """
    hit_mask = backend.round(backend.clip(y_true * y_pred, 0, 1))
    predicted_mask = backend.round(backend.clip(y_pred, 0, 1))
    # epsilon keeps the division defined when nothing was predicted positive
    return backend.sum(hit_mask) / (backend.sum(predicted_mask) + backend.epsilon())

def f1_m(y_true, y_pred):
    """Return the F1 score: the harmonic mean of precision_m and recall_m."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    product = p * r
    # epsilon keeps the division defined when precision and recall are both 0
    total = p + r + backend.epsilon()
    return 2 * (product / total)

### **KNN Model**

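**GridSearchCV()**: `GridSearchCV` performs an exhaustive search over a grid of candidate hyperparameter values. For every combination in the grid it trains the given estimator and scores it with k-fold cross-validation, then keeps the combination with the best mean score. Both the estimator (the model) and the parameter grid must be supplied. Once the best parameter values have been found, the estimator refitted with them is used to make predictions.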

In [None]:
# Hyperparameter candidates explored by the grid search:
# 3 neighbour counts x 2 weighting schemes x 2 distance metrics
grid_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
print(X_train.shape)

# Every combination is scored by accuracy with 5-fold cross-validation;
# n_jobs=-1 lets the search use all available processors.
base_estimator = KNeighborsClassifier()
model = GridSearchCV(
    estimator=base_estimator,
    param_grid=grid_params,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

(800, 330752)


### **Training**

In [None]:
# Run the grid search on the training split
model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean'], 'n_neighbors': [3],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy', verbose=1)

In [None]:
import pickle

# save the model to disk
# (`with` guarantees the file handle is flushed and closed, even on error)
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


# load the model from disk
# NOTE: pickle.load can execute arbitrary code - only load files that were
# produced by this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

### **Evaluation Metrics**

In [None]:
# Predict the held-out samples with the best estimator found by the search
y_pred = model.best_estimator_.predict(X_test)

# Compute every metric on the held-out split
accuracy = model.score(X_test, y_test)
precision = precision_m(y_test, y_pred)
recall = recall_m(y_test, y_pred)
f1_score = f1_m(y_test, y_pred)

# Report them in a fixed order
for metric_label, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_value)

Accuracy: 0.615
Precision: tf.Tensor(0.7278106, shape=(), dtype=float32)
Recall: tf.Tensor(0.615, shape=(), dtype=float32)
F1 score: tf.Tensor(0.6666666, shape=(), dtype=float32)


In [None]:
# Display the estimator that the grid search selected as best
model.best_estimator_

KNeighborsClassifier(metric='euclidean', n_neighbors=3, weights='distance')

In [None]:
# Display the parameter combination that the grid search selected as best
model.best_params_

{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

Confusion Matrix

In [None]:
# Turn each one-hot test row back into its integer class index
# (the position of the first entry equal to 1)
y_test_mod = [numpy.flatnonzero(one_hot_row == 1)[0] for one_hot_row in y_test]
print(y_test_mod)

[7, 0, 1, 5, 0, 0, 6, 4, 6, 9, 4, 1, 7, 9, 6, 5, 6, 4, 5, 0, 7, 6, 9, 6, 9, 9, 3, 4, 7, 0, 8, 9, 9, 8, 7, 9, 3, 8, 5, 7, 3, 5, 2, 1, 9, 6, 1, 0, 3, 1, 9, 1, 8, 4, 6, 8, 4, 7, 5, 1, 7, 1, 5, 2, 9, 0, 4, 3, 3, 8, 4, 8, 4, 3, 9, 4, 1, 0, 5, 1, 9, 6, 2, 8, 6, 6, 9, 2, 6, 0, 8, 4, 7, 7, 6, 0, 2, 9, 7, 3, 5, 8, 3, 6, 1, 0, 8, 8, 5, 7, 0, 7, 9, 2, 0, 4, 1, 2, 1, 0, 2, 6, 5, 3, 0, 8, 7, 3, 7, 8, 3, 4, 1, 2, 9, 4, 0, 2, 8, 8, 2, 1, 7, 1, 0, 3, 1, 2, 3, 1, 9, 7, 0, 2, 7, 1, 8, 5, 8, 5, 3, 5, 0, 0, 2, 9, 3, 2, 9, 3, 2, 0, 3, 5, 6, 3, 6, 3, 7, 2, 0, 2, 7, 1, 9, 2, 8, 1, 2, 4, 2, 4, 4, 5, 2, 5, 8, 5, 5, 9]


In [None]:
# Turn each predicted indicator row back into an integer class index.
# NOTE(review): the classifier was fitted on one-hot labels, so it predicts
# one 0/1 column per class and a row can come back with no 1 at all. Such rows
# are assigned an extra "no prediction" class. Fitting on integer labels
# instead would avoid this, but that change touches several cells.
# The extra class index is one past the last real class, derived from the
# number of label columns rather than hard-coded.
no_prediction_class = y_pred.shape[1]
y_pred_mod = []
for predicted_row in y_pred:
  hits = numpy.flatnonzero(predicted_row == 1)
  # If several columns are 1, the first one wins (same as before)
  y_pred_mod.append(hits[0] if hits.size else no_prediction_class)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
# The sample count in the heading is taken from the data, not hard-coded.
print(f'{len(y_test_mod)} test samples\n', confusion_matrix(y_test_mod, y_pred_mod))

200 test samples
 [[11  0  1  0  0  1  1  1  3  1  3]
 [ 0 17  0  0  0  1  0  1  0  1  1]
 [ 0  0 15  0  0  0  2  2  0  0  3]
 [ 0  0  1 10  0  1  1  1  1  0  5]
 [ 0  0  0  0 11  1  0  0  0  0  5]
 [ 0  0  1  1  0 14  1  0  2  0  0]
 [ 0  0  1  0  2  1  8  0  0  0  5]
 [ 0  0  0  0  2  0  0 18  0  0  0]
 [ 0  0  1  0  0  1  0  1 13  0  4]
 [ 0  1  1  3  2  0  0  1  3  6  5]
 [ 0  0  0  0  0  0  0  0  0  0  0]]
