## Importing libraries

In [1]:
import librosa
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
from tqdm import tqdm

Using TensorFlow backend.


## Create labels
Input: Folder Path   
Output: Tuple (Label, Indices of the labels, one-hot encoded labels)

In [2]:
DATA_PATH = "./eng_data/"

def get_labels(path=DATA_PATH):
    labels = os.listdir(path)
    label_indices = np.arange(0, len(labels))
    return labels, label_indices, to_categorical(label_indices)

## Converting wave to mfcc
Input: path of file, maximum pad length(default=11)   
Output: list of mfcc vectors

In [3]:
def wav2mfcc(file_path, max_len=11):
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    wave = wave[::3]
    try:
        mfcc = librosa.feature.mfcc(wave, sr=16000)
    except Exception as e:
        print(file_path)
        print(e)
        
    # If maximum length exceeds mfcc lengths then pad the remaining ones
    if (max_len > mfcc.shape[1]):
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')

    # Else cutoff the remaining parts
    else:
        mfcc = mfcc[:, :max_len]
    
    return mfcc

## Saving mfcc vectors to .npy files
Input: path to write files   

In [4]:
def save_data_to_array(path=DATA_PATH, max_len=11):
    labels, _, _ = get_labels(path)

    for label in labels:
        mfcc_vectors = []
        wavfiles = [path + label + '/' + wavfile for wavfile in os.listdir(path + '/' + label)]
        
        for wavfile in tqdm(wavfiles, "Saving vectors of label - '{}'".format(label)):
            mfcc = wav2mfcc(wavfile, max_len=max_len)
            mfcc_vectors.append(mfcc)
            
        np.save(label + '.npy', mfcc_vectors)

## Dividing data into training set and testing set
Input: split ratio (default=0.8)    
Output: 2 arrays of training and testing

In [5]:
def get_train_test(split_ratio=0.8):
    # Get available labels
    labels, indices, _ = get_labels(DATA_PATH)

    # Getting first arrays
    X = np.load(labels[0] + '.npy')
    y = np.zeros(X.shape[0])

    # Append all of the dataset into one single array, same goes for y
    for i, label in enumerate(labels[1:]):
        x = np.load(label + '.npy')
        X = np.vstack((X, x))
        y = np.append(y, np.full(x.shape[0], fill_value= (i+1)))

    assert X.shape[0] == len(y)

    return train_test_split(X, y, test_size= (1 - split_ratio), shuffle=True)

## Initialization by calling the necessay functions

In [6]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical

# Second dimension of the feature is dim2
feature_dim_2 = 11

# Save data to array file first (Comment next line when npy files are created to save time)
# save_data_to_array(max_len=feature_dim_2)

# Loading train set and test set
X_train, X_test, y_train, y_test = get_train_test()

# Feature dimension
feature_dim_1 = 20
channel = 1
epochs = 20
batch_size = 100
verbose = 1
num_classes = 7

# Reshaping to perform 2D convolution
X_train = X_train.reshape(X_train.shape[0], feature_dim_1, feature_dim_2, channel)
X_test = X_test.reshape(X_test.shape[0], feature_dim_1, feature_dim_2, channel)

y_train_hot = to_categorical(y_train)
y_test_hot = to_categorical(y_test)

## Creating the model

In [7]:
def get_model():
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=(feature_dim_1, feature_dim_2, channel)))
    model.add(Conv2D(48, kernel_size=(2, 2), activation='relu'))
    model.add(Conv2D(120, kernel_size=(2, 2), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adadelta(),
                  metrics=['accuracy'])
    return model

In [8]:
# Predicts a sample
def predict(filepath, model):
    sample = wav2mfcc(filepath)
    sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)
    return get_labels()[0][
            np.argmax(model.predict(sample_reshaped))
    ]

## Starting the training process

In [9]:
model = get_model()
model.fit(X_train, y_train_hot, batch_size=batch_size, epochs=epochs, verbose=verbose, validation_data=(X_test, y_test_hot))

Train on 13256 samples, validate on 3314 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc21b576588>

## Model visualized

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 19, 10, 32)        160       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 18, 9, 48)         6192      
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 17, 8, 120)        23160     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 4, 120)         0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 8, 4, 120)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3840)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               491648    
__________

In [24]:
# A nice method to check all the attributes of an object
from pprint import pprint
pprint(vars(model))

{'_built': True,
 '_collected_trainable_weights': [<tf.Variable 'conv2d_1/kernel:0' shape=(2, 2, 1, 32) dtype=float32_ref>,
                                  <tf.Variable 'conv2d_1/bias:0' shape=(32,) dtype=float32_ref>,
                                  <tf.Variable 'conv2d_2/kernel:0' shape=(2, 2, 32, 48) dtype=float32_ref>,
                                  <tf.Variable 'conv2d_2/bias:0' shape=(48,) dtype=float32_ref>,
                                  <tf.Variable 'conv2d_3/kernel:0' shape=(2, 2, 48, 120) dtype=float32_ref>,
                                  <tf.Variable 'conv2d_3/bias:0' shape=(120,) dtype=float32_ref>,
                                  <tf.Variable 'dense_1/kernel:0' shape=(3840, 128) dtype=float32_ref>,
                                  <tf.Variable 'dense_1/bias:0' shape=(128,) dtype=float32_ref>,
                                  <tf.Variable 'dense_2/kernel:0' shape=(128, 64) dtype=float32_ref>,
                                  <tf.Variable 'dense_2/bias:0' 

## Prediction

In [25]:
print(predict('./eng_data/left/1cc80e39_nohash_0.wav', model=model))

left


## Confidence

In [26]:
def confidence(filepath, model):
    sample = wav2mfcc(filepath)
    sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)
    u = model.predict(sample_reshaped)
#     print(u)
    return u, get_labels()[0][
            np.argmax(model.predict(sample_reshaped))
    ]

In [27]:
labs = get_labels()[0]

In [28]:
confs, word = confidence('./eng_data/left/1cc80e39_nohash_0.wav', model=model)
for lab,conf in zip(labs,confs[0]):
    print(lab , "    " ,conf)
print(word)

down      8.978884e-09
go      8.8394725e-10
left      0.99993
on      1.9132768e-08
right      6.860813e-05
up      4.0531023e-07
yes      9.4027564e-07
left


In [73]:
def confidence(new_sample, model):

    new_sample = new_sample.reshape(-1)
    # print("Ye: " + str(new_sample.shape))
    sample = array2mfcc(new_sample)
    sample_reshaped = sample.reshape(1,20,11,1)
    return model.predict(sample_reshaped)


def get_conf(inp, model):
    labs = get_labels()[0]
    inp = np.array(inp)
    confs = confidence(inp, model=model)
#     print("zzz")
#     print(confs[0])
#     print(labs)
    for lab,conf in zip(labs,confs[0]):
        print(lab , "    " ,conf)
    # print(word)
    
def get_conf2(sample, model):
    for a,b in zip(get_labels()[0], model.predict(sample)[0]):
        print(a, "     ", b)


In [72]:
# for i in model.predict(sample_reshaped)[0]:
#     print(i)
# model.predict(sample_reshaped)[0]

0.701204
0.037764013
0.004798433
0.08345175
0.00975744
0.007902366
0.15512204


array([0.701204  , 0.03776401, 0.00479843, 0.08345175, 0.00975744,
       0.00790237, 0.15512204], dtype=float32)

# Testing the model in realtime

In [29]:
import sounddevice as sd
import time

## Converting the recorded array to mfcc
Instead of recording speech into a file and reading from it to create an array, this function would directly convert the recorded array into mfcc coefficients.

In [30]:
def array2mfcc(wave, max_len=11):
    sr = 16000
    wave = wave[::3]
    mfcc = librosa.feature.mfcc(wave, sr=16000)

    # If maximum length exceeds mfcc lengths then pad the remaining ones
    if (max_len > mfcc.shape[1]):
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')

    # Else cutoff the remaining parts
    else:
        mfcc = mfcc[:, :max_len]
    
    return mfcc

In [31]:
## For reference: These are the words trained to the classifier 
get_labels()[0]

['down', 'go', 'left', 'on', 'right', 'up', 'yes']

## Recording...
Please speak after executing the next cell.   

In [168]:
duration = 1  # seconds
fs = 16000
new_sample = sd.rec(int(duration * fs), samplerate=fs, channels=1)
time.sleep(duration)
sd.play(new_sample, fs)

In [169]:
new_sample = new_sample.reshape(-1)
sample = array2mfcc(new_sample)
sample_reshaped = sample.reshape(1,20,11,1)

print(get_labels()[0][np.argmax(model.predict(sample_reshaped))])
get_conf2(sample_reshaped, model)

yes
down       0.09955799
go       0.0428727
left       0.15488596
on       0.007816722
right       0.012278808
up       0.0067787645
yes       0.675809


In [118]:
..stop..

SyntaxError: invalid syntax (<ipython-input-118-ebe35fe59f44>, line 1)

## Saving classifier

In [113]:
model_json = model.to_json()

In [114]:
with open("model7.json", "w") as json_file:
    json_file.write(model_json)

In [115]:
model.save_weights("model7.h5")

## Loading classifier

In [None]:
from keras.models import model_from_json
# load json and create model
json_file = open('model1.json', 'r')
model_json = json_file.read()
json_file.close()
model = model_from_json(model_json)
# load weights into new model
model.load_weights("model1.h5")
print("Loaded model from disk")