## Importing libraries

In [1]:
import librosa
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
from tqdm import tqdm

Using TensorFlow backend.


## Create labels
Input: Folder Path   
Output: Tuple (labels, label indices, one-hot encoded labels)

In [2]:
DATA_PATH = "./eng_data/"

def get_labels(path=DATA_PATH):
    """Return the class labels found under ``path``.

    Every sub-directory of ``path`` is treated as one class; its name is the
    label. Plain files in ``path`` are ignored because the rest of the
    notebook lists each label as a directory of audio files.

    Args:
        path: Dataset root folder containing one sub-folder per label.

    Returns:
        A tuple ``(labels, label_indices, one_hot)`` where ``labels`` is the
        sorted list of label names, ``label_indices`` is
        ``np.arange(len(labels))`` and ``one_hot`` is
        ``to_categorical(label_indices)``.
    """
    # os.listdir() returns entries in arbitrary order. Sorting keeps the
    # label -> index mapping stable between calls, sessions and machines,
    # which matters because training and prediction both index into this list.
    labels = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    label_indices = np.arange(0, len(labels))
    return labels, label_indices, to_categorical(label_indices)

## Converting wave to mfcc
Input: path of file, maximum pad length(default=11)   
Output: MFCC matrix (2-D array) padded or truncated to `max_len` frames

In [3]:
def wav2mfcc(file_path, max_len=11):
    """Load an audio file and return its MFCC matrix with exactly ``max_len`` frames.

    Args:
        file_path: Path of the audio file to load.
        max_len: Number of frames (columns) in the returned matrix. Shorter
            results are zero-padded on the right, longer ones are truncated.

    Returns:
        2-D array of MFCCs whose second dimension equals ``max_len``.

    Raises:
        Exception: Whatever ``librosa.feature.mfcc`` raises; the offending
            file path and the error are printed first, then the error is
            re-raised.
    """
    # The file's native sampling rate is not used below, so it is discarded.
    wave, _ = librosa.load(file_path, mono=True, sr=None)
    # Keep every third sample.
    # NOTE(review): this is plain decimation, yet the MFCC call below is still
    # told sr=16000. Left unchanged because previously saved features and
    # trained weights depend on it -- confirm whether this is intended.
    wave = wave[::3]
    try:
        # Pass the signal by keyword: newer librosa releases reject a positional `y`.
        mfcc = librosa.feature.mfcc(y=wave, sr=16000)
    except Exception as e:
        print(file_path)
        print(e)
        # Re-raise: continuing would fail on the unbound `mfcc` below and
        # hide the real cause.
        raise

    # If maximum length exceeds mfcc lengths then pad the remaining ones
    if (max_len > mfcc.shape[1]):
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')

    # Else cutoff the remaining parts
    else:
        mfcc = mfcc[:, :max_len]

    return mfcc

## Saving mfcc vectors to .npy files
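Input: path of the dataset folder to read (one sub-folder per label); one `<label>.npy` file per label is written to the current working directory   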

In [4]:
def save_data_to_array(path=DATA_PATH, max_len=11):
    """Compute MFCCs for every file of every label and save them as ``.npy`` files.

    For each label returned by ``get_labels(path)``, every file inside the
    label's folder is converted with ``wav2mfcc`` and the results are saved
    together as ``<label>.npy`` in the current working directory.

    Args:
        path: Dataset root folder containing one sub-folder per label.
        max_len: Number of MFCC frames kept per file (forwarded to ``wav2mfcc``).
    """
    labels, _, _ = get_labels(path)

    for label in labels:
        # os.path.join works whether or not `path` ends with a separator;
        # plain string concatenation silently produced wrong paths otherwise.
        label_dir = os.path.join(path, label)
        wavfiles = [os.path.join(label_dir, wavfile) for wavfile in os.listdir(label_dir)]

        mfcc_vectors = [
            wav2mfcc(wavfile, max_len=max_len)
            for wavfile in tqdm(wavfiles, "Saving vectors of label - '{}'".format(label))
        ]

        np.save(label + '.npy', mfcc_vectors)

## Dividing data into training set and testing set
Input: split ratio (default=0.8)    
Output: 4 arrays (X_train, X_test, y_train, y_test)

In [None]:
def get_train_test(split_ratio=0.8, random_state=None):
    """Load the saved per-label ``.npy`` files and split them into train and test sets.

    The labels come from ``get_labels(DATA_PATH)``; the samples of the label
    at position ``i`` get the target value ``i``.

    Args:
        split_ratio: Fraction of the samples used for training.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            shuffle can be reproduced. ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``.

    Raises:
        ValueError: If no labels are found under ``DATA_PATH``.
    """
    # Get available labels
    labels, _, _ = get_labels(DATA_PATH)
    if len(labels) == 0:
        raise ValueError("No labels found under {}".format(DATA_PATH))

    # Collect every label's array first and stack once at the end; growing X
    # with vstack inside the loop re-copies all previous data on each pass.
    features = []
    targets = []
    for index, label in enumerate(labels):
        x = np.load(label + '.npy')
        features.append(x)
        # Float targets, matching the np.zeros-based array used previously.
        targets.append(np.full(x.shape[0], fill_value=index, dtype=float))

    X = np.vstack(features)
    y = np.concatenate(targets)

    assert X.shape[0] == len(y)

    return train_test_split(
        X, y, test_size=(1 - split_ratio), shuffle=True, random_state=random_state
    )

## Initialization by calling the necessary functions

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical

# Second dimension of the feature is dim2 (number of MFCC frames kept per clip)
feature_dim_2 = 11

# Save data to array file first (Comment next line when npy files are created to save time)
# save_data_to_array(max_len=feature_dim_2)

# Loading train set and test set
X_train, X_test, y_train, y_test = get_train_test()

# Feature dimension
feature_dim_1 = 20
channel = 1
epochs = 50
batch_size = 100
verbose = 1
# One class per label folder. Deriving the count keeps the model's output
# layer and the one-hot targets in step with the dataset instead of relying
# on a hard-coded 10.
num_classes = len(get_labels()[0])

# Reshaping to perform 2D convolution
X_train = X_train.reshape(X_train.shape[0], feature_dim_1, feature_dim_2, channel)
X_test = X_test.reshape(X_test.shape[0], feature_dim_1, feature_dim_2, channel)

# Fix the one-hot width explicitly: without num_classes, to_categorical sizes
# each array from its own largest target, so a split that happens to miss the
# last class would give train and test targets of different widths.
y_train_hot = to_categorical(y_train, num_classes=num_classes)
y_test_hot = to_categorical(y_test, num_classes=num_classes)

## Creating the model

In [None]:
def get_model():
    """Build and compile the convolutional classifier.

    The network is three 2x2 convolutions, a max-pooling step, then two dense
    layers with dropout, ending in a softmax over ``num_classes`` outputs.
    Input shape is ``(feature_dim_1, feature_dim_2, channel)``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adadelta optimizer, accuracy metric).
    """
    input_shape = (feature_dim_1, feature_dim_2, channel)

    # Layers are listed in the order they are stacked.
    stack = [
        Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=input_shape),
        Conv2D(48, kernel_size=(2, 2), activation='relu'),
        Conv2D(120, kernel_size=(2, 2), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.1),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.1),
        Dense(64, activation='relu'),
        Dropout(0.1),
        Dense(num_classes, activation='softmax'),
    ]

    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adadelta(),
        metrics=['accuracy'],
    )
    return network

In [None]:
# Predicts a sample
def predict(filepath, model):
    """Return the predicted label name for the audio file at ``filepath``.

    Args:
        filepath: Path of the audio file to classify.
        model: Model exposing ``predict``; it receives one sample shaped
            ``(1, feature_dim_1, feature_dim_2, channel)``.

    Returns:
        The entry of ``get_labels()[0]`` at the index of the highest score.
    """
    # Request exactly feature_dim_2 frames so the reshape below stays valid
    # even if feature_dim_2 is changed from wav2mfcc's default max_len.
    sample = wav2mfcc(filepath, max_len=feature_dim_2)
    sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)
    labels = get_labels()[0]
    return labels[np.argmax(model.predict(sample_reshaped))]

## Starting the training process

In [None]:
# Build a fresh network and train it, reporting loss/accuracy on the held-out
# split after every epoch.
model = get_model()
model.fit(
    X_train,
    y_train_hot,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    validation_data=(X_test, y_test_hot),
)

Train on 18945 samples, validate on 4737 samples
Epoch 1/50
Epoch 2/50

## Model visualized

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
# A nice method to check all the attributes of an object:
# vars() returns the object's attribute dictionary and pprint formats it readably.
from pprint import pprint
pprint(vars(model))

## Prediction

In [None]:
# Classify one clip from the dataset and print the predicted label.
print(predict('./eng_data/left/1cc80e39_nohash_0.wav', model=model))

# Testing the model in realtime

In [None]:
import sounddevice as sd
import time

## Converting the recorded array to mfcc
Instead of recording speech into a file and reading from it to create an array, this function would directly convert the recorded array into mfcc coefficients.

In [None]:
def array2mfcc(wave, max_len=11):
    """Convert an in-memory audio signal to an MFCC matrix with ``max_len`` frames.

    Mirrors ``wav2mfcc`` but starts from an already-recorded array instead of
    a file on disk.

    Args:
        wave: 1-D array holding the audio signal.
        max_len: Number of frames (columns) in the returned matrix. Shorter
            results are zero-padded on the right, longer ones are truncated.

    Returns:
        2-D array of MFCCs whose second dimension equals ``max_len``.
    """
    sr = 16000
    # Keep every third sample, the same preprocessing wav2mfcc applies.
    wave = wave[::3]
    # Pass the signal by keyword: newer librosa releases reject a positional `y`.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr)

    # If maximum length exceeds mfcc lengths then pad the remaining ones
    if (max_len > mfcc.shape[1]):
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')

    # Else cutoff the remaining parts
    else:
        mfcc = mfcc[:, :max_len]

    return mfcc

In [None]:
## For reference: these are the words the classifier was trained on
get_labels()[0]

## Recording...
Please speak after executing the next cell.   

In [None]:
duration = 1  # seconds
fs = 16000  # sampling rate in Hz
# sd.rec() starts recording in the background and returns immediately.
new_sample = sd.rec(int(duration * fs), samplerate=fs, channels=1)
# Block until the recording has actually finished, rather than sleeping for
# a guessed amount of time.
sd.wait()

In [None]:
# Play the recording back to check what was captured.
sd.play(new_sample, fs)

In [None]:
# Flatten the single-channel recording into a 1-D signal.
new_sample = new_sample.reshape(-1)
# Use the shared feature-dimension constants so this cell stays consistent
# with the shape the model was built with.
sample = array2mfcc(new_sample, max_len=feature_dim_2)
sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)

print(get_labels()[0][np.argmax(model.predict(sample_reshaped))])

In [None]:
# Deliberate barrier: halts "Run All" here so the save/load cells below are
# only executed on purpose.
raise RuntimeError("Stop: run the cells below manually to save or load the classifier.")

## Saving classifier

In [None]:
# Serialize the model to a JSON string; the weights are saved separately below.
model_json = model.to_json()

In [None]:
# Write the JSON description to model.json in the current working directory.
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# Save the trained weights to model.h5 next to model.json.
model.save_weights("model.h5")

## Loading classifier

In [None]:
from keras.models import model_from_json
# load json and create model
# The context manager closes the file even if read() raises.
with open('model.json', 'r') as json_file:
    model_json = json_file.read()
model = model_from_json(model_json)
# load weights into new model
model.load_weights("model.h5")
print("Loaded model from disk")