In [2]:
import os
import io
import json
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
#Loads training dataset from json file
with open("data.json", "r") as fp:
    data = json.load(fp)

X = np.array(data["MFCCs"])
y = np.array(data["labels"])

In [4]:
# create train, validation, test split
train_img, test_img, train_label, test_label = train_test_split(X, y, test_size=0.2)
train_img, validation_img, train_label, validation_label = train_test_split(train_img, train_label, test_size=0.2)

In [5]:
# add an axis to nd array
train_img = train_img[..., np.newaxis]
test_img = test_img[..., np.newaxis]
validation_img = validation_img[..., np.newaxis]

In [6]:
input_shape = (train_img.shape[1], train_img.shape[2], 1)

In [7]:
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(64, kernel_size=(3,3), activation="relu", input_shape=input_shape))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(1,1), padding='same'))
model.add(tf.keras.layers.Conv2D(64, kernel_size=(3,3), activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Conv2D(128, kernel_size=(3,3), activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten()) # Flattening the 2D arrays for fully connected layers
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(10,activation="softmax"))
# print model parameters on console
#model.summary()

In [8]:
# compile model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

#train the model
model.fit(train_img, train_label, epochs=10 , validation_data=(validation_img, validation_label))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a189c1fd00>

In [9]:
# evaluate network on test set
test_loss, test_acc = model.evaluate(test_img, test_label)
print("loss: " , test_loss)
print("accuracy: ", test_acc)

loss:  0.3057601749897003
accuracy:  0.9167253375053406


In [10]:
#make prediction
file_path = "up.wav"
signal, sample_rate = librosa.load(file_path)


In [11]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]

            # extract MFCCs
            MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
MGCCs = MFCCs.T

# we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

In [12]:
mapping = np.array(data["mapping"])
print(mapping)

['down' 'go' 'left' 'no' 'off' 'on' 'right' 'stop' 'up' 'yes']


In [13]:
# get the predicted label
predictions = model.predict(MFCCs)[0]
print(predictions)
predicted_index = np.argmax(predictions)
predicted_keyword = mapping[predicted_index]
print(predicted_keyword)

[8.41170400e-02 1.69087184e-06 3.84402949e-15 1.14383267e-10
 1.28914740e-10 9.15866077e-01 5.50265172e-12 1.48137133e-05
 3.15191841e-07 1.09869884e-13]
on


In [14]:
res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
print(str(res))

{'down': 8.41, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 91.59, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}


In [53]:
file_path = "speech_test/off.wav"
signal, sample_rate = librosa.load(file_path)


In [54]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]
           # extract MFCCs
MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
MGCCs = MFCCs.T

# we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

In [59]:
# get the predicted label
predictions = model.predict(MFCCs)[0]
print(predictions)
predicted_index = np.argmax(predictions)
predicted_keyword = mapping[predicted_index]
print(predicted_keyword)

[3.3704596e-04 4.6765595e-06 4.5452747e-13 2.0949321e-10 1.0525706e-06
 9.9965239e-01 2.6351923e-10 1.0062279e-06 3.9269908e-06 4.2629064e-14]
on


In [60]:
res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
print(str(res))

{'down': 0.03, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 99.97, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}


In [67]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
path = "speech_test"
for i in os.listdir(path):
    #file_path = "speech_test/off.wav"
    signal, sample_rate = librosa.load(path+ '//' +i)
    print(path+'/'+i)
    if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]
           # extract MFCCs
    MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
    MGCCs = MFCCs.T

    # we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
    MFCCs = MFCCs[np.newaxis, ..., np.newaxis]
    # get the predicted label
    predictions = model.predict(MFCCs)[0]
    #print(predictions)
    res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
    print(str(res))
    predicted_index = np.argmax(predictions)
    predicted_keyword = mapping[predicted_index]
    print("predicted keyword: ", predicted_keyword)
    print("\n")

speech_test/down.wav
{'down': 99.65, 'go': 0.02, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 0.3, 'right': 0.0, 'stop': 0.02, 'up': 0.0, 'yes': 0.0}
predicted keyword:  down


speech_test/go.wav
{'down': 3.4, 'go': 0.19, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 96.35, 'right': 0.0, 'stop': 0.03, 'up': 0.03, 'yes': 0.0}
predicted keyword:  on


speech_test/left.wav
{'down': 0.06, 'go': 99.78, 'left': 0.0, 'no': 0.16, 'off': 0.0, 'on': 0.0, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}
predicted keyword:  go


speech_test/no.wav
{'down': 0.55, 'go': 31.07, 'left': 5.52, 'no': 16.69, 'off': 9.89, 'on': 0.56, 'right': 0.2, 'stop': 1.11, 'up': 34.17, 'yes': 0.23}
predicted keyword:  up


speech_test/off.wav
{'down': 0.03, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 0.0, 'on': 99.97, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}
predicted keyword:  on


speech_test/on.wav
{'down': 21.51, 'go': 3.67, 'left': 0.0, 'no': 0.22, 'off': 0.1, 'on': 69.22, 'right': 0.01, 'stop': 0.14, 'up': 5.1