## MFCC feature extraction and Network training

In this notebook you will go through an example flow of processing audio data, complete with feature extraction and training.

Make sure you read the instructions on the exercise sheet and follow the task order.

#### Task 1 

In [None]:
import json
import numpy as np
from scipy.io import wavfile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, regularizers
from tqdm import tqdm

# The notebook is pinned to one exact TensorFlow release and needs eager execution.
assert tf.__version__ == "2.1.0"
assert tf.executing_eagerly()

# Dataset root. Paths are built by plain string concatenation, so the trailing
# slash is required.
DataSetPath = "../hey_snips_kws_4.0/hey_snips_research_6k_en_train_eval_clean_ter/"

# Per-sample metadata; later cells read 'audio_file_path' and 'is_hotword' from each entry.
with open(DataSetPath + "train.json") as train_json:
    traindata = json.load(train_json)

with open(DataSetPath + "test.json") as test_json:
    testdata = json.load(test_json)

#### Task 2

In [None]:
def _segment_waveform(samples, sliceLength, segmentLength):
    """Zero-pad or truncate a waveform to sliceLength samples and cut it into segments.

    Args:
        samples: Array of amplitudes as returned by ``wavfile.read``.
        sliceLength: Total number of samples to keep; must be a multiple of
            segmentLength.
        segmentLength: Number of samples per (non-overlapping) segment.

    Returns:
        float32 array of shape (sliceLength // segmentLength, segmentLength).
    """
    # Flatten in C order, which is what the previous in-place ndarray.resize did.
    flat = np.ravel(samples)
    # Writing into a fresh zero buffer avoids ndarray.resize, whose reference
    # check can raise ValueError when something else holds the array.
    padded = np.zeros(sliceLength, dtype=np.float32)
    usable = min(flat.size, sliceLength)
    padded[:usable] = flat[:usable]  # anything beyond sliceLength is dropped
    return padded.reshape(-1, segmentLength)  # segments with 0 overlap


def _load_split(entries, count, sliceLength, segmentLength):
    """Read up to `count` wav files listed in `entries` and return (features, labels) tensors."""
    features = []
    labels = []
    # Slicing clamps to the available entries; count=None selects all of them.
    for entry in tqdm(entries[:count]):
        # The sampling rate stored in the file is not checked against the expected one.
        _, samples = wavfile.read(DataSetPath + entry['audio_file_path'])
        features.append(_segment_waveform(samples, sliceLength, segmentLength))
        labels.append(entry['is_hotword'])  # Read label
    return (tf.convert_to_tensor(np.asarray(features)),
            tf.convert_to_tensor(np.asarray(labels)))


def load_data(trainsize=1000, testsize=100, totalSliceLength=10, fs=16000, segmentLength=1024):
    """Load, zero-pad and segment the training and test audio clips.

    Reads the wav files referenced by the module-level `traindata` / `testdata`
    metadata (paths relative to `DataSetPath`), pads or truncates every clip to
    a fixed length and splits it into non-overlapping segments.

    Args:
        trainsize: Number of training samples to load; None loads all of them.
            Values larger than the metadata list are clamped.
        testsize: Number of test samples to load; same rules as `trainsize`.
        totalSliceLength: Length to stuff the signals to, given in seconds.
        fs: Sampling rate assumed when converting seconds to samples.
        segmentLength: Number of samples to use per segment.

    Returns:
        Tuple (x_train, y_train, x_test, y_test) of tensors. The x tensors are
        float32 with shape (samples, segments, segmentLength); the y tensors
        hold the 'is_hotword' value of every sample.
    """
    # Round the slice down to a whole number of segments so the reshape is exact.
    sliceLength = int(totalSliceLength * fs / segmentLength) * segmentLength

    x_train, y_train = _load_split(traindata, trainsize, sliceLength, segmentLength)
    x_test, y_test = _load_split(testdata, testsize, sliceLength, segmentLength)

    return x_train, y_train, x_test, y_test

In [None]:
def compute_mfccs(tensor):
    """Compute 13 MFCCs for every 1024-sample segment of `tensor`.

    Intended for the (samples, segments, 1024) tensors produced by load_data:
    with frame length and frame step both set to 1024, each segment yields
    one STFT frame.

    Args:
        tensor: Float tensor whose last axis holds the audio samples.

    Returns:
        Tensor of MFCCs with a trailing singleton axis appended, i.e. shape
        (dim0, dim1, 13, 1) for the input described above.
    """
    fft_size = 1024
    n_coefficients = 13
    rate_hz = 16000.0
    mel_low_hz, mel_high_hz, n_mel = 80.0, 7600.0, 80

    # Non-overlapping frames: the step equals the frame length.
    stft_out = tf.signal.stft(
        tensor,
        frame_length=fft_size,
        frame_step=fft_size,
        fft_length=fft_size,
    )
    n_bins = stft_out.shape[-1]

    # Magnitude spectrum, with the frame axis folded into the bin axis.
    magnitudes = tf.abs(stft_out)
    magnitudes = tf.reshape(magnitudes, [magnitudes.shape[0], magnitudes.shape[1], -1])

    # Project the linear-frequency bins onto the mel scale.
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
        n_mel, n_bins, rate_hz, mel_low_hz, mel_high_hz)
    mel_energy = tf.tensordot(magnitudes, mel_filterbank, 1)

    # The small offset keeps the logarithm finite for silent (all-zero) segments.
    log_mel = tf.math.log(mel_energy + 1e-6)

    # Keep only the first n_coefficients cepstral coefficients.
    coefficients = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :n_coefficients]
    out_shape = [coefficients.shape[0], coefficients.shape[1], coefficients.shape[2], -1]
    return tf.reshape(coefficients, out_shape)

In [None]:
# Read and segment the audio clips; with the default settings this loads
# 1000 training and 100 test samples.
x_train, y_train, x_test, y_test = load_data()

#### Task 3

In [None]:
# Convert the segmented waveforms of both splits into MFCC features.
x_train_mfcc = compute_mfccs(x_train)
x_test_mfcc = compute_mfccs(x_test)

# Show the resulting feature tensor shapes as a sanity check.
print(x_train_mfcc.shape)
print(x_test_mfcc.shape)

#### Task 4

In [None]:
# Training hyper-parameters
batchSize = 10
epochs = 30

# Affine rescaling of the MFCC features (divide by 512, shift by 0.5).
# NOTE(review): presumably meant to bring the values roughly into [0, 1] - confirm.
train_set = x_train_mfcc / 512 + 0.5
test_set = x_test_mfcc / 512 + 0.5

train_labels = y_train
test_labels = y_test

# NOTE(review): no layers are added to the model here; the architecture is
# expected to be filled in at this point as part of the task.
model = tf.keras.models.Sequential()


# Sparse categorical cross-entropy takes integer class labels rather than one-hot vectors.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(train_set, train_labels, batch_size=batchSize, epochs=epochs)


In [None]:
# Print the layer overview, then measure loss and accuracy on the held-out test split.
model.summary()
model.evaluate(test_set, y_test)

In [None]:
# Store the trained model in the current working directory
# (the .h5 suffix selects the Keras HDF5 format).
model.save("MFCCmodel.h5")

#### NNoM Extract

In [1]:
import numpy as np
from nnom_utils import * 
import tensorflow as tf
from tensorflow import keras

# np.load returns a lazily-read NpzFile for .npz archives; the with-block closes
# the underlying file once the two arrays have been materialised.
with np.load("../../data_nnom/2class_mm_intra_test_split1.npz") as npzfile:
    # NOTE(review): the arrays are stored under 'X_Train' / 'y_Train' although the
    # file name refers to a test split - confirm this is the intended data.
    x_test, y_test = npzfile['X_Train'], npzfile['y_Train']

# Pre-trained Keras model that is converted for NNoM.
model = keras.models.load_model("../../model_nnom/global_class_2_ds3_nch38_T1_split_1_v1.h5")
# nnom_utils helper; x_test is passed as the data it uses to derive the per-layer
# value ranges and 'dec bit' settings reported in the cell output.
# NOTE(review): presumably writes the result to weights.h - confirm.
generate_model(model, x_test, format='hwc', name="weights.h" )

Instructions for updating:
non-resource variables are not supported in the long term
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
input_1 max value: 1.0 min value: 0.0 dec bit 7
KLD loss [8.512857792631207, 0.05423308028116321, 3.5350372574248023, 10.68689804271818]
KLD shift [2, 3, 4, 5]
conv2d_1 is using KLD method, original shift 2 KLD results 3
conv2d_1 max value: 18.657698 min value: -17.214493 dec bit 3
KLD loss [1.8572619425097299, 0.03126832680802451, 0.2883232057864603, 1.8431031639186313]
KLD shift [3, 4, 5, 6]
batch_normalization_1 is using KLD method, original shift 3 KLD results 4
batch_normalization_1 max value: 9.7006035 min value: -14.863518 dec bit 4
KLD loss [3.346329867044317, 0.2793355000830491, 1.250332911578891, 2.6988595963244317]
KLD shift [3, 4, 5, 6]
depthwise_conv2d_1 is using KLD method, original shift 3 KLD results 4
depthwise_conv2d_1 max value: 15.94808 min value: -14.485238 dec bit 4
KLD loss [8.719743910656735, 0.0493

In [8]:
import numpy as np
from nnom_utils import * 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

# np.load returns a lazily-read NpzFile for .npz archives; the with-block closes
# the underlying file once the two arrays have been materialised.
with np.load("../../data_nnom/2class_mm_intra_test_split1.npz") as npzfile:
    # NOTE(review): the arrays are stored under 'X_Train' / 'y_Train' although the
    # file name refers to a test split - confirm this is the intended data.
    x_test, y_test = npzfile['X_Train'], npzfile['y_Train']
# One-hot encode the integer class labels.
y_test = to_categorical(y_test)

model = keras.models.load_model("../../model_nnom/global_class_2_ds3_nch38_T1_split_1_v1.h5")
# The inputs are scaled by 127 before being written to the binary test file.
# NOTE(review): presumably to map [0, 1] inputs onto the int8 range - confirm.
generate_test_bin(x_test*127, y_test, name='test_data.bin')
# Evaluate the float Keras model on the same (unscaled) data.
scores = evaluate_model(model,x_test, y_test)

binary test file generated: test_data.bin
test data length: 1470
1470/1470 - 0s - loss: 0.3678 - acc: 0.8422
Test loss: 0.3677783435299283
Top 1: 0.84217685
Top 2: [1. 1. 1. ... 1. 1. 1.]
[[624 111]
 [121 614]]
conv2d_1_1/kernel:0 Dec num: 6
batch_normalization_1_1/gamma:0 Dec num: 12
batch_normalization_1_1/beta:0 Dec num: 12
batch_normalization_1_1/moving_mean:0 Dec num: 12
batch_normalization_1_1/moving_variance:0 Dec num: 12
depthwise_conv2d_1_1/depthwise_kernel:0 Dec num: 7
batch_normalization_2_1/gamma:0 Dec num: 7
batch_normalization_2_1/beta:0 Dec num: 7
batch_normalization_2_1/moving_mean:0 Dec num: 7
batch_normalization_2_1/moving_variance:0 Dec num: 7
separable_conv2d_1_1/depthwise_kernel:0 Dec num: 6
separable_conv2d_1_1/pointwise_kernel:0 Dec num: 6
batch_normalization_3_1/gamma:0 Dec num: 8
batch_normalization_3_1/beta:0 Dec num: 8
batch_normalization_3_1/moving_mean:0 Dec num: 8
batch_normalization_3_1/moving_variance:0 Dec num: 8
dense_1/kernel:0 Dec num: 10
dense_1/bia

In [9]:
# Show the value returned by evaluate_model; in the recorded run this is [test loss, top-1 accuracy].
print(scores)

[0.3677783435299283, 0.84217685]
