In [1]:
import os
import io
import json
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
#Loads training dataset from json file
with open("data.json", "r") as fp:
    data = json.load(fp)

X = np.array(data["MFCCs"])
y = np.array(data["labels"])

In [3]:
# create train, validation, test split
train_img, test_img, train_label, test_label = train_test_split(X, y, test_size=0.2)
train_img, validation_img, train_label, validation_label = train_test_split(train_img, train_label, test_size=0.2)

In [4]:
# add an axis to nd array
train_img = train_img[..., np.newaxis]
test_img = test_img[..., np.newaxis]
validation_img = validation_img[..., np.newaxis]

In [5]:
input_shape = (train_img.shape[1], train_img.shape[2], 1)

In [6]:
model = tf.keras.models.Sequential()

#1st conv layer
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=input_shape, kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))

# 2nd conv layer
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))

# 3rd conv layer
model.add(tf.keras.layers.Conv2D(32, (2, 2), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling2D((2, 2), strides=(2,2), padding='same'))

# flatten output and feed into dense layer
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
tf.keras.layers.Dropout(0.3)

# softmax output layer
model.add(tf.keras.layers.Dense(10, activation='softmax'))

In [7]:
# compile model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

#train the model
model.fit(train_img, train_label, epochs=25 , validation_data=(validation_img, validation_label))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x2ce0fea59d0>

In [8]:
# evaluate network on test set
test_loss, test_acc = model.evaluate(test_img, test_label)
print("loss: " , test_loss)
print("accuracy: ", test_acc)

loss:  0.4404069483280182
accuracy:  0.9108608961105347


In [9]:
#make prediction
file_path = "left.wav"
signal, sample_rate = librosa.load(file_path)


In [10]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]

            # extract MFCCs
            MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
MGCCs = MFCCs.T

# we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

In [11]:
mapping = np.array(data["mapping"])
print(mapping)

['down' 'go' 'left' 'no' 'off' 'on' 'right' 'stop' 'up' 'yes']


In [12]:
# get the predicted label
predictions = model.predict(MFCCs)[0]
print(predictions)
predicted_index = np.argmax(predictions)
predicted_keyword = mapping[predicted_index]
print(predicted_keyword)

[9.9976069e-01 4.0300108e-08 5.6684885e-28 2.3875410e-04 1.3880232e-25
 2.8547601e-18 5.8229201e-26 5.5689486e-07 9.4601727e-37 2.5568176e-23]
down


In [13]:
res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
print(str(res))

{'down': 99.98, 'go': 0.0, 'left': 0.0, 'no': 0.02, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}


In [14]:
file_path = "speech_test/down.wav"
signal, sample_rate = librosa.load(file_path)


In [15]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]
           # extract MFCCs
MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
MGCCs = MFCCs.T

# we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

In [16]:
# get the predicted label
predictions = model.predict(MFCCs)[0]
print(predictions)
predicted_index = np.argmax(predictions)
predicted_keyword = mapping[predicted_index]
print(predicted_keyword)

[1.0000000e+00 5.3615449e-16 1.0775522e-37 1.5427034e-08 0.0000000e+00
 0.0000000e+00 0.0000000e+00 9.8093265e-21 0.0000000e+00 1.7638609e-37]
down


In [17]:
res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
print(str(res))

{'down': 100.0, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}
