In [None]:
import os
import numpy as np
import keras
import librosa
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, Flatten, GlobalMaxPooling2D , GlobalMaxPooling1D, MaxPooling2D, MaxPooling1D, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPool2D, concatenate, Dense, Dropout
from keras.layers import  Conv2D, MaxPooling2D, UpSampling2D, Lambda, Reshape
from keras.layers import Input, GRU, RepeatVector, BatchNormalization, TimeDistributed, Conv1D
from keras.layers import GlobalAveragePooling1D, LSTM
from keras.optimizers import Adam
from tensorflow.python.keras.utils import to_categorical

from sklearn.metrics import classification_report,accuracy_score

from scipy.io import wavfile
from tqdm import tqdm
from pydub import AudioSegment


path = "data/train/audio"

In [None]:
def prepare_training_data(data, augmentation = False):
    if augmentation:
        np_folder = "data_np_mfcc_aug"
    else:
        np_folder = 'data_np_mfcc'
        
    labels_actual = 'yes no up down left right on off stop go silence unknown'.split()
    labels_actual.remove('right')
    labels = os.listdir(data)
    labels.remove('.DS_Store')
    labels.remove('_background_noise_')

    X = np.load(np_folder+ '/' + labels[0] + '.npy')
    y = np.zeros(X.shape[0])
    

    for label in labels[1:]:
        if label in labels_actual:
            i = labels_actual.index(label)+1
            x = np.load(np_folder+ '/' + label + '.npy')
            X = np.vstack((X, x))
            y = np.append(y, np.full(x.shape[0], fill_value= (i)))
        else:
            x = np.load(np_folder+ '/' + label + '.npy')
            X = np.vstack((X, x))
            y = np.append(y, np.full(x.shape[0], fill_value= (11))) 
            
    assert X.shape[0] == len(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, shuffle=True)
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = prepare_training_data(path)

In [None]:
np.unique(y_train)

In [None]:
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

In [None]:
X_train.shape

In [None]:
y_train_cat.shape

# Conv1D - 1 layer only + FCN 

In [68]:
model = Sequential()
model.add(Conv1D(32, kernel_size= 2, activation='relu', input_shape=(12,51)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.25))
#model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))

model.add(Dense(12, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])

In [69]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 11, 32)            3296      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 32)                0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 128)               4224      
_________________________________________________________________
dropout_22 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_23 (Dropout)         (None, 128)               0         
__________

In [None]:
model.fit(X_train, y_train_cat, batch_size=1024, epochs=50,verbose=1)

In [None]:
model_json_conv1d_1layer_mfcc = model.to_json()
with open("models/model_conv1d_1layer_mfcc.json","w") as json_file:
    json_file.write(model_json_conv1d_1layer_mfcc)

In [None]:
model.save_weights("models/model_conv1d_1layer_mfcc.h5")
print("saved model")

In [None]:
score = model.evaluate(X_test, y_test_cat, verbose=1)
print(score)

In [None]:
y_pred = model.predict(X_test)
y_pred_val = []
for row in y_pred:
    y_pred_val.append(np.argmax(row))
y_pred_val = to_categorical(y_pred_val)

In [None]:
print(classification_report(y_test_cat,y_pred_val, target_names= labels_actual))

# Conv2D - 1 layer only + FCN

In [131]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=(12, 51,1)))
model.add(MaxPooling2D(pool_size=(2, 2),padding ='same'))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))

model.add(Dense(12, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])

In [132]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_30 (Conv2D)           (None, 11, 50, 32)        160       
_________________________________________________________________
max_pooling2d_24 (MaxPooling (None, 6, 25, 32)         0         
_________________________________________________________________
dropout_69 (Dropout)         (None, 6, 25, 32)         0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 4800)              0         
_________________________________________________________________
dense_54 (Dense)             (None, 128)               614528    
_________________________________________________________________
dropout_70 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_55 (Dense)             (None, 128)               16512     
__________

In [None]:
X_train_2d = X_train.reshape(X_train.shape[0], 12, 51, 1)
X_test_2d = X_test.reshape(X_test.shape[0], 12, 51, 1)

In [None]:
model.fit(X_train_2d, y_train_cat, batch_size=1024, epochs=50,verbose=1)

In [None]:
model.save_weights("models/model_conv2d_1layer_mfcc.h5")
print("saved model")

In [None]:
model_json_conv2d_1layer_mfcc = model.to_json()
with open("models/model_conv2d_1layer_mfcc.json","w") as json_file:
    json_file.write(model_json_conv2d_1layer_mfcc)

In [None]:
score = model.evaluate(X_test_2d, y_test_cat, verbose=1)
score

In [None]:
y_pred = model.predict(X_test_2d)
y_pred_val = []
for row in y_pred:
    y_pred_val.append(np.argmax(row))
y_pred_val = to_categorical(y_pred_val)

In [None]:
print(accuracy_score(y_test_cat,y_pred_val))

In [None]:
print(classification_report(y_test_cat,y_pred_val, target_names= labels_actual))

# Conv1D 3 layers - 100 epochs + FCN (Dense Network)

In [122]:
model = Sequential()
model.add(Conv1D(16, kernel_size= 2, activation='relu', input_shape=(12,51)))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(MaxPooling1D())

model.add(Conv1D(32, kernel_size= 4, activation='relu',padding = 'same'))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(MaxPooling1D())

model.add(Conv1D(64, kernel_size= 6, activation='relu', padding = 'same'))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(MaxPooling1D())


model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(12, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])

In [123]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_21 (Conv1D)           (None, 11, 16)            1648      
_________________________________________________________________
batch_normalization_23 (Batc (None, 11, 16)            64        
_________________________________________________________________
dropout_63 (Dropout)         (None, 11, 16)            0         
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 5, 16)             0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 5, 32)             2080      
_________________________________________________________________
batch_normalization_24 (Batc (None, 5, 32)             128       
_________________________________________________________________
dropout_64 (Dropout)         (None, 5, 32)             0         
__________

In [None]:
 model.fit(X_train, y_train_cat, batch_size=1024, epochs=50, verbose=1)

In [125]:
model_json_conv1d_3layer_100 = model.to_json()
with open("./model_conv1d_3layer_100.json","w") as json_file:
    json_file.write(model_json_conv1d_3layer_100)

In [126]:
model.save_weights("./model_conv1d_3layer_100.h5")
print("saved model")

saved model


In [127]:
score = model.evaluate(X_test, y_test_cat, verbose=1)
score



[0.682146965027999, 0.7851468824940048]

In [128]:
y_pred = model.predict(X_test)
y_pred_val = []
for row in y_pred:
    y_pred_val.append(np.argmax(row))
y_pred_val = to_categorical(y_pred_val)

In [129]:
print(accuracy_score(y_test_cat,y_pred_val))

0.7851468824940048


In [130]:
print(classification_report(y_test_cat,y_pred_val, target_names= labels_actual))

             precision    recall  f1-score   support

        yes       0.76      0.46      0.58       506
         no       0.87      0.82      0.84       450
         up       0.62      0.40      0.49       468
       down       0.55      0.49      0.52       524
       left       0.83      0.20      0.32       457
      right       0.73      0.61      0.66       465
         on       0.70      0.37      0.49       456
        off       0.78      0.63      0.70       487
       stop       0.81      0.74      0.77       474
         go       0.60      0.37      0.46       507
    silence       0.85      0.98      0.91       386
    unknown       0.81      0.94      0.87      8164

avg / total       0.78      0.79      0.77     13344



# Conv2D 3 layers - 100 epochs + FCN (Dense Network)

In [113]:
model = Sequential()
model.add(Conv2D(16, kernel_size=(2, 2), padding = 'same', activation='relu', input_shape=(12, 51,1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(32, kernel_size=(4, 4), padding = 'same',activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, kernel_size=(6, 6), padding = 'same',activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))

model.add(Dense(12, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=Adam(lr = 0.001),
              metrics=['accuracy'])

In [114]:
X_train_2d = X_train.reshape(X_train.shape[0], 12, 51, 1)
X_test_2d = X_test.reshape(X_test.shape[0], 12, 51, 1)

In [None]:
model.fit(X_train_2d, y_train_cat, batch_size=1024, epochs=50, verbose=1)

In [116]:
model_json_conv2d_3layer_mel = model.to_json()
with open("./model_conv2d_3layer_mel.json","w") as json_file:
    json_file.write(model_json_conv2d_3layer_mel)

In [117]:
model.save_weights("./model_conv2d_3layer_mel.h5")
print("saved model")

saved model


In [118]:
score = model.evaluate(X_test_2d, y_test_cat, verbose=1)
print(score)

[0.20691717945807558, 0.938923860911271]


In [119]:
y_pred = model.predict(X_test_2d)
y_pred_val = []
for row in y_pred:
    y_pred_val.append(np.argmax(row))
y_pred_val = to_categorical(y_pred_val)

In [120]:
print(accuracy_score(y_test_cat,y_pred_val))

0.938923860911271


In [121]:
print(classification_report(y_test_cat,y_pred_val, target_names= labels_actual))

             precision    recall  f1-score   support

        yes       0.90      0.91      0.91       506
         no       0.96      0.97      0.97       450
         up       0.85      0.88      0.86       468
       down       0.96      0.82      0.89       524
       left       0.84      0.86      0.85       457
      right       0.99      0.89      0.94       465
         on       0.84      0.86      0.85       456
        off       0.89      0.90      0.89       487
       stop       0.96      0.93      0.95       474
         go       0.87      0.74      0.80       507
    silence       0.98      1.00      0.99       386
    unknown       0.96      0.97      0.97      8164

avg / total       0.94      0.94      0.94     13344



# Final Model - LSTM  + CNN - Without Data Augmentation

In [142]:
model = Sequential()

model.add(Conv2D(256, kernel_size = (5,5), input_shape = (12, 51, 1), padding = 'same'))
model.add(MaxPooling2D(pool_size = (5, 5), strides = (1,1), padding = 'same'))
model.add(BatchNormalization())

model.add(Conv2D(128, kernel_size = (3,2), activation = 'relu'))
model.add(BatchNormalization())

model.add(Conv2D(64, kernel_size = (2,2)))
model.add(MaxPooling2D(pool_size = (2, 2), strides = (1,1)))
model.add(BatchNormalization())

model.add(Conv2D(1, kernel_size = (3,3), padding = 'same'))

model.add(Reshape((8, 48)))

model.add(LSTM(256, return_sequences = True))
model.add(LSTM(256, return_sequences = True))

model.add(Dense(128))

model.add(TimeDistributed(Dense(64)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(12))
model.add(Activation('softmax'))

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss="categorical_crossentropy", optimizer = adam, metrics = ['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_44 (Conv2D)           (None, 12, 51, 256)       6656      
_________________________________________________________________
max_pooling2d_38 (MaxPooling (None, 12, 51, 256)       0         
_________________________________________________________________
batch_normalization_26 (Batc (None, 12, 51, 256)       1024      
_________________________________________________________________
conv2d_45 (Conv2D)           (None, 10, 50, 128)       196736    
_________________________________________________________________
batch_normalization_27 (Batc (None, 10, 50, 128)       512       
_________________________________________________________________
conv2d_46 (Conv2D)           (None, 9, 49, 64)         32832     
_________________________________________________________________
max_pooling2d_39 (MaxPooling (None, 8, 48, 64)         0         
__________

In [143]:
X_train_2d = X_train.reshape(X_train.shape[0], 12, 51, 1)
X_test_2d = X_test.reshape(X_test.shape[0], 12, 51, 1)

In [None]:
model.fit(X_train_2d, y_train_cat, batch_size=256, epochs=20, verbose=1)

In [145]:
model_json_conv2d_lstm_mfcc_51 = model.to_json()
with open("models/model_conv2d_lstm_mfcc_51.json","w") as json_file:
    json_file.write(model_json_conv2d_lstm_mfcc_51)

In [146]:
model.save_weights("models/model_conv2d_lstm_mfcc_51.h5")
print("saved model")

saved model


In [147]:
score = model.evaluate(X_test_2d, y_test_cat, verbose=1)
print(score)

[0.3330962593990771, 0.9318794964028777]


In [150]:
y_pred = model.predict(X_test_2d)
y_pred_val = []
for row in y_pred:
    y_pred_val.append(np.argmax(row))
y_pred_val = to_categorical(y_pred_val)

In [151]:
print(accuracy_score(y_test_cat,y_pred_val))

0.9318794964028777


In [152]:
print(classification_report(y_test_cat,y_pred_val, target_names= labels_actual))

             precision    recall  f1-score   support

        yes       0.97      0.83      0.89       506
         no       0.98      0.92      0.95       450
         up       0.78      0.91      0.84       468
       down       0.89      0.87      0.88       524
       left       0.91      0.79      0.85       457
      right       0.93      0.89      0.91       465
         on       0.84      0.86      0.85       456
        off       0.84      0.90      0.87       487
       stop       0.93      0.92      0.93       474
         go       0.86      0.81      0.84       507
    silence       0.99      0.99      0.99       386
    unknown       0.95      0.97      0.96      8164

avg / total       0.93      0.93      0.93     13344



# Final Model - LSTM  + CNN - With Data Augmentation

In [102]:
model = Sequential()

model.add(Conv2D(256, kernel_size = (5,5), input_shape = (12, 51, 1), padding = 'same'))
model.add(MaxPooling2D(pool_size = (5, 5), strides = (1,1), padding = 'same'))
model.add(BatchNormalization())

model.add(Conv2D(128, kernel_size = (3,2), activation = 'relu'))
model.add(BatchNormalization())

model.add(Conv2D(64, kernel_size = (2,2)))
model.add(MaxPooling2D(pool_size = (2, 2), strides = (1,1)))
model.add(BatchNormalization())

model.add(Conv2D(1, kernel_size = (3,3), padding = 'same'))

model.add(Reshape((8, 48)))

model.add(LSTM(256, return_sequences = True))
model.add(LSTM(256, return_sequences = True))

model.add(Dense(128))

model.add(TimeDistributed(Dense(64)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(12))
model.add(Activation('softmax'))

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss="categorical_crossentropy", optimizer = adam, metrics = ['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_124 (Conv2D)          (None, 12, 51, 256)       6656      
_________________________________________________________________
max_pooling2d_98 (MaxPooling (None, 12, 51, 256)       0         
_________________________________________________________________
batch_normalization_115 (Bat (None, 12, 51, 256)       1024      
_________________________________________________________________
conv2d_125 (Conv2D)          (None, 10, 50, 128)       196736    
_________________________________________________________________
batch_normalization_116 (Bat (None, 10, 50, 128)       512       
_________________________________________________________________
conv2d_126 (Conv2D)          (None, 9, 49, 64)         32832     
_________________________________________________________________
max_pooling2d_99 (MaxPooling (None, 8, 48, 64)         0         
__________

In [103]:
X_train_2d = X_train.reshape(X_train.shape[0], 12, 51, 1)
X_test_2d = X_test.reshape(X_test.shape[0], 12, 51, 1)

In [None]:
model.fit(X_train_2d, y_train_cat, batch_size=256, epochs=20, verbose=1)

In [105]:
model_json_conv2d_lstm_mfcc_aug = model.to_json()
with open("models/model_conv2d_lstm_mfcc_aug.json","w") as json_file:
    json_file.write(model_json_conv2d_lstm_mfcc_aug)

In [106]:
model.save_weights("models/model_conv2d_lstm_mfcc_aug.h5")
print("saved model")

saved model


In [None]:
score = model.evaluate(X_test_2d, y_test_cat, verbose=1)
print(score)

In [108]:
y_pred = model.predict(X_test_2d)
y_pred_val = []
for row in y_pred:
    y_pred_val.append(np.argmax(row))
y_pred_val = to_categorical(y_pred_val)

In [None]:
print(accuracy_score(y_test_cat,y_pred_val))

In [110]:
print(classification_report(y_test_cat,y_pred_val, target_names= labels_actual))

             precision    recall  f1-score   support

        yes       0.91      0.90      0.91      1925
         no       0.98      0.92      0.95      1933
         up       0.91      0.85      0.88      1900
       down       0.91      0.87      0.89      1917
       left       0.95      0.77      0.85      1811
      right       0.93      0.89      0.91      1959
         on       0.89      0.86      0.88      1932
        off       0.93      0.86      0.89      1854
       stop       0.97      0.85      0.91      1891
         go       0.85      0.82      0.84      1817
    silence       0.99      0.99      0.99       801
    unknown       0.94      0.98      0.96     32837

avg / total       0.94      0.94      0.94     52577

