In [1]:
import os
import io
import json
import librosa
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
#Loads training dataset from json file
with open("data.json", "r") as fp:
    data = json.load(fp)

X = np.array(data["MFCCs"])
y = np.array(data["labels"])

In [3]:
# create train, validation, test split
train_img, test_img, train_label, test_label = train_test_split(X, y, test_size=0.2)
train_img, validation_img, train_label, validation_label = train_test_split(train_img, train_label, test_size=0.2)

In [4]:
# add an axis to nd array
train_img = train_img[..., np.newaxis]
test_img = test_img[..., np.newaxis]
validation_img = validation_img[..., np.newaxis]

In [5]:
input_shape = (train_img.shape[1], train_img.shape[2], 1)

In [6]:
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Conv2D(64, kernel_size=(3,3), activation="relu", input_shape=input_shape))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(1,1), padding='same'))
model.add(tf.keras.layers.Conv2D(64, kernel_size=(3,3), activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Conv2D(128, kernel_size=(3,3), activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
model.add(tf.keras.layers.Flatten()) # Flattening the 2D arrays for fully connected layers
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(10,activation="softmax"))
# print model parameters on console
#model.summary()

In [7]:
# compile model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

#train the model
model.fit(train_img, train_label, epochs=50 , validation_data=(validation_img, validation_label))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1d91ef22b80>

In [8]:
# evaluate network on test set
test_loss, test_acc = model.evaluate(test_img, test_label)
print("loss: " , test_loss)
print("accuracy: ", test_acc)

loss:  0.5868441462516785
accuracy:  0.9125029444694519


In [9]:
#make prediction
file_path = "up.wav"
signal, sample_rate = librosa.load(file_path)


In [10]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]

            # extract MFCCs
            MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
MGCCs = MFCCs.T

# we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

In [11]:
mapping = np.array(data["mapping"])
print(mapping)

['down' 'go' 'left' 'no' 'off' 'on' 'right' 'stop' 'up' 'yes']


In [12]:
# get the predicted label
predictions = model.predict(MFCCs)[0]
print(predictions)
predicted_index = np.argmax(predictions)
predicted_keyword = mapping[predicted_index]
print(predicted_keyword)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
off


In [13]:
res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
print(str(res))

{'down': 0.0, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 100.0, 'on': 0.0, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}


In [14]:
file_path = "speech_test/stop.wav"
signal, sample_rate = librosa.load(file_path)


In [15]:
SAMPLES_TO_CONSIDER = 22050 #samples in 1 sec
if len(signal) >= SAMPLES_TO_CONSIDER:
            # ensure consistency of the length of the signal
            signal = signal[:SAMPLES_TO_CONSIDER]
           # extract MFCCs
MFCCs = librosa.feature.mfcc(signal, sample_rate)
            
MGCCs = MFCCs.T

# we need a 4-dim array to feed to the model for prediction: (# samples, # time steps, # coefficients, 1)
MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

In [16]:
# get the predicted label
predictions = model.predict(MFCCs)[0]
print(predictions)
predicted_index = np.argmax(predictions)
predicted_keyword = mapping[predicted_index]
print(predicted_keyword)

[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
off


In [17]:
res = {mapping[i]: round(predictions[i]*100,2) for i in range(len(mapping))}
print(str(res))

{'down': 0.0, 'go': 0.0, 'left': 0.0, 'no': 0.0, 'off': 100.0, 'on': 0.0, 'right': 0.0, 'stop': 0.0, 'up': 0.0, 'yes': 0.0}
