In [1]:
import os
import io
import json
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
#Loads training dataset from json file
with open("data.json", "r") as fp:
    data = json.load(fp)

X = np.array(data["MFCCs"])
y = np.array(data["labels"])

In [3]:
# create train, validation, test split

train_img, test_img, train_label, test_label = train_test_split(X, y, test_size=0.2)
train_img, validation_img, train_label, validation_label = train_test_split(train_img, train_label, test_size=0.2)

In [4]:
# add an axis to nd array
train_img = train_img[..., np.newaxis]
test_img = test_img[..., np.newaxis]
validation_img = validation_img[..., np.newaxis]

In [5]:
input_shape = (train_img.shape[1], train_img.shape[2], 1)
print(input_shape)

(44, 20, 1)


In [6]:
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(64, kernel_size=(3,3), activation="relu", input_shape=input_shape))
model.add(tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))
model.add(tf.keras.layers.Conv2D(64, kernel_size=(3,3), activation="relu"))
model.add(tf.keras.layers.MaxPooling2D((3, 3), strides=(2,2), padding='same'))
model.add(tf.keras.layers.Conv2D(128, kernel_size=(3,3), activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten()) # Flattening the 2D arrays for fully connected layers
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(10,activation="softmax"))
# print model parameters on console
#model.summary()

In [7]:
# compile model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

#train the model
model.fit(train_img, train_label, epochs=30 , validation_data=(validation_img, validation_label))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2a7b59862b0>

In [8]:
# evaluate network on test set
test_loss, test_acc = model.evaluate(test_img, test_label)
print("loss: " , test_loss)
print("accuracy: ", test_acc)

loss:  0.3576222360134125
accuracy:  0.9216514229774475


In [9]:
mapping = np.array(data["mapping"])
print(mapping)

['down' 'go' 'left' 'no' 'off' 'on' 'right' 'stop' 'up' 'yes']


In [10]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
path = "speech_test"
for i in os.listdir(path):
    #file_path = "speech_test/off.wav"
    signal, sample_rate = librosa.load(path+ '//' +i)
    print(path+'/'+i)
    if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]
           # extract MFCCs
    MFCCs = librosa.feature.mfcc(signal, sample_rate)
    MFCCs = MFCCs.reshape(44,20)        
    # we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
    MFCCs = MFCCs[np.newaxis, ... , np.newaxis]

    # get the predicted label
    predictions = model.predict(MFCCs)[0]
    #print(predictions)
    res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
    print(str(res))
    predicted_index = np.argmax(predictions)
    predicted_keyword = mapping[predicted_index]
    print("predicted keyword: ", predicted_keyword)
    print("\n")

speech_test/down.wav
{'down': 0.0, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 100.0, 'up': 0.0, 'yes': 0.0}
predicted keyword:  stop


speech_test/go.wav
{'down': 100.0, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}
predicted keyword:  down


speech_test/left.wav
{'down': 99.56, 'go': 0.18, 'left': 0.0, 'no': 0.02, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.24, 'up': 0.0, 'yes': 0.0}
predicted keyword:  down


speech_test/no.wav
{'down': 97.74, 'go': 0.27, 'left': 0.0, 'no': 1.97, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.02, 'up': 0.0, 'yes': 0.0}
predicted keyword:  down


speech_test/off.wav
{'down': 0.0, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.17, 'on': 0.0, 'right': 0.0, 'stop': 99.83, 'up': 0.0, 'yes': 0.0}
predicted keyword:  stop


speech_test/on.wav
{'down': 0.0, 'go': 0.02, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.62, 'up': 99.36, 'yes'