<a href="https://colab.research.google.com/github/sfansaria/Automated-Speech-Recognition-System-for-Spoken-digits-Using-Deep-Learning/blob/main/ASR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
main.py for running the main training code
CNN_model.py has all the model classes
plot_results.py plots the graphs by loading model history
model_load.py loads a given model and plots confusion matrix
'''


import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from CNN_model import CNNSpeech
from CNN_model import ResNetSpeech
from CNN_model import CNNSpeechRegularised
from keras.utils.vis_utils import plot_model
import pickle
from keras import backend as K



# building dataset from image directory
def dataset(train,val):
    """Build the training and validation image datasets.

    Both directories are read with the same settings: labels inferred from
    the directory structure, one-hot ("categorical") label vectors,
    grayscale images, ``image_size=(98, 50)`` and batches of 128.

    Args:
        train: Path of the directory holding the training images.
        val: Path of the directory holding the validation images.

    Returns:
        A ``(train_set, val_set)`` tuple.
    """
    # Options shared by both splits, kept in one place so they cannot drift.
    loader_options = dict(
        labels="inferred",
        color_mode="grayscale",
        label_mode="categorical",
        batch_size=128,
        image_size=(98, 50),
    )
    train_set = keras.utils.image_dataset_from_directory(
        directory=train, **loader_options)
    val_set = keras.utils.image_dataset_from_directory(
        directory=val, **loader_options)
    return train_set, val_set


#main training function
def train(train_set,val_set, model, batch_size, epochs):
    """Compile ``model`` and fit it on ``train_set``.

    Args:
        train_set: Already-batched training data yielding
            ``(inputs, one-hot labels)`` pairs.
        val_set: Validation data in the same form, evaluated after every
            epoch.
        model: Keras model to compile and train (modified in place).
        batch_size: Accepted for backward compatibility only. The datasets
            are batched when they are built, and Keras documents that
            ``batch_size`` must not be given to ``fit`` for dataset inputs,
            so it is not forwarded.
        epochs: Number of passes over ``train_set``.

    Returns:
        ``(model, history)`` where ``history`` is whatever ``model.fit``
        returned.
    """
    # The labels are one-hot vectors over mutually exclusive classes, so the
    # matching objective is categorical cross-entropy. Binary cross-entropy
    # would score every output unit as an independent yes/no problem.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    # fit the keras model on the dataset
    history = model.fit(train_set,
                        epochs=epochs,
                        verbose=True, validation_data=val_set)

    return model, history

# training function for learning rate decay experiments in two stages
def train_learning_rate_exp(train_set,val_set, model, batch_size, epochs,
                            second_stage_lr=0.0001):
    """Train in two stages, lowering the learning rate for the second one.

    The model is fitted for ``epochs`` epochs with the optimizer's initial
    learning rate, the learning rate is then set to ``second_stage_lr`` and
    the model is fitted for another ``epochs`` epochs. The accuracy curves
    of both stages are concatenated and plotted.

    Args:
        train_set: Already-batched training data yielding
            ``(inputs, one-hot labels)`` pairs.
        val_set: Validation data in the same form.
        model: Keras model to compile and train (modified in place).
        batch_size: Accepted for backward compatibility only; the datasets
            are already batched, so it is not forwarded to ``fit``.
        epochs: Number of epochs for *each* of the two stages.
        second_stage_lr: Learning rate used for the second stage.

    Returns:
        ``(model, history)`` where ``history`` is the result of the
        second ``fit`` call only.
    """
    # One-hot, mutually exclusive labels call for categorical cross-entropy;
    # binary cross-entropy would treat each output as a separate problem.
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    # Stage 1: initial learning rate.
    history = model.fit(train_set,
                        epochs=epochs,
                        verbose=True, validation_data=val_set)
    train_acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    # Stage 2: continue from the current weights with the lower rate.
    K.set_value(model.optimizer.learning_rate, second_stage_lr)
    history = model.fit(train_set,
                        epochs=epochs,
                        verbose=True, validation_data=val_set)

    # Stitch both stages together so the plot spans the whole run.
    train_acc = train_acc + history.history['accuracy']
    val_acc = val_acc + history.history['val_accuracy']

    plt.plot(train_acc)
    plt.plot(val_acc)
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    return model, history


# elementary plotting function
def plot(history):
    """Show the training curves stored in ``history``.

    Prints the recorded metric names, then displays two figures one after
    the other: accuracy and loss, each with its training and validation
    curve.

    Args:
        history: Object whose ``history`` attribute maps metric names
            ('accuracy', 'val_accuracy', 'loss', 'val_loss') to per-epoch
            values.
    """
    curves = history.history
    print(curves.keys())
    # One figure per metric; the validation series uses the 'val_' prefix.
    for metric in ('accuracy', 'loss'):
        plt.plot(curves[metric])
        plt.plot(curves['val_' + metric])
        plt.title('model ' + metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()


In [None]:
#model evaluation
def evaluate_model(model, val_set):
    """Evaluate ``model`` on ``val_set`` and print its accuracy in percent.

    Args:
        model: Object whose ``evaluate`` method returns a two-item result;
            the second item is taken as the accuracy.
        val_set: Data handed to ``model.evaluate``.
    """
    results = model.evaluate(val_set)
    # The first entry of the pair is not reported.
    _, accuracy = results
    print(f'Accuracy: {accuracy * 100:.2f}')


def save_model_weights(model, filename="saved_models/model_weights"):
    """Save only the weights of ``model``.

    This helper has its own name because a second, two-argument
    ``save_model(model, filename)`` is defined later in the same script;
    sharing that name made this weights-only variant unreachable.

    Args:
        model: Object exposing ``save_weights``.
        filename: Destination handed to ``save_weights``; defaults to the
            path that used to be hard-coded.
    """
    model.save_weights(filename, save_format='tf')


def show_model(model,filename):
    """Draw the architecture of ``model`` into the file ``filename``.

    Delegates to ``plot_model`` with shapes and layer names switched on.

    Args:
        model: Model to draw.
        filename: Output path passed as ``to_file``.
    """
    diagram_options = {"show_shapes": True, "show_layer_names": True}
    plot_model(model, to_file=filename, **diagram_options)

# save model training history
def save_history(history,filename):
    """Pickle ``history`` into the file ``filename``.

    Args:
        history: Object to serialise (the training history).
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If ``filename`` cannot be opened for writing, e.g. because
            its parent directory does not exist.
    """
    # The context manager closes the file even when pickling fails.
    with open(filename, 'wb') as outbuffer:
        pickle.dump(history, outbuffer)

# load model history given a path
def load_history(filename):
    """Load a pickled training history from ``filename``.

    Args:
        filename: Path of a file written with ``pickle.dump``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load history files
    # that this project produced itself.
    with open(filename, 'rb') as inbuffer:
        return pickle.load(inbuffer)


def save_model(model,filename):
    """Save ``model`` by calling its ``save`` method with ``filename``."""
    model.save(filename)

if __name__ == '__main__':
    # Training entry point: build the datasets, train one model, plot and
    # evaluate it, then store the history and the model on disk.
    import os

    train_data = "/home/saba/PycharmProjects/saba/speechImageData/TrainingData"
    val_data = "/home/saba/PycharmProjects/saba/speechImageData/ValidationData"
    train_set, val_set = dataset(train_data,val_data)

    # Swap the active line to train one of the other architectures.
    model = CNNSpeech()
    # model = CNNSpeechRegularised()
    # model = ResNetSpeech()
    batch_size = 128
    epochs = 100

    model, history = train(train_set,val_set,model,batch_size,epochs)
    # Alternative: two-stage training with a reduced learning rate.
    # model, history = train_learning_rate_exp(train_set, val_set, model, batch_size, epochs)
    print(model.optimizer.learning_rate)
    plot(history)
    evaluate_model(model, val_set)

    # save_history opens the file directly, which fails when the folder is
    # missing; create it first so a finished training run is not lost.
    os.makedirs("history", exist_ok=True)
    save_history(history, "history/test.hist")
    save_model(model, "model/test")
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None".
    model.summary()



In [None]:
'''
model class file
'''

import tensorflow as tf
from keras.models import Sequential
from keras.layers import BatchNormalization, MaxPool2D
from keras.layers import Conv2D
from keras.layers import Activation
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.models import load_model
from tensorflow import keras


class CNNSpeech(tf.keras.Model):
    """Plain convolutional classifier.

    Layer stack: a 32-filter 3x3 ReLU convolution, ``num_of_blocks``
    repetitions of (3x3 ReLU convolution -> batch normalisation -> 2x2 max
    pooling with 'same' padding), a flatten step, a 512-unit ReLU dense
    layer and a softmax output layer.

    Args:
        num_of_blocks: Number of convolution blocks (default 4).
        filters: Filters per block convolution (default 512).
        num_classes: Units of the softmax output layer (default 12).

    The defaults reproduce the previously hard-coded architecture.
    """

    def __init__(self, num_of_blocks=4, filters=512, num_classes=12):
        super().__init__()
        self.input_cnn = Conv2D(32, (3, 3), strides=(1, 1), activation="relu")
        # dynamic creation of convolution blocks
        self.cnn_block = Sequential()
        for _ in range(num_of_blocks):
            self.cnn_block.add(
                Conv2D(filters, (3, 3), strides=(1, 1), activation="relu"))
            self.cnn_block.add(BatchNormalization())
            self.cnn_block.add(MaxPool2D((2, 2), padding='same'))
        self.flatten = Flatten()
        self.dense1 = Dense(512, activation="relu")
        self.dense2 = Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Run the forward pass and return the softmax class scores."""
        x = self.input_cnn(inputs)
        x = self.cnn_block(x)
        x = self.flatten(x)
        x = self.dense1(x)
        return self.dense2(x)


In [None]:
class CNNSpeechRegularised(tf.keras.Model):
    """Convolutional classifier with an L2-regularised dense layer.

    Layer stack: a 32-filter 3x3 ReLU convolution, ``num_of_blocks``
    repetitions of (3x3 ReLU convolution -> batch normalisation -> 2x2 max
    pooling with 'same' padding), a flatten step, a 512-unit ReLU dense
    layer with an L2 kernel regulariser, a 128-unit ReLU dense layer and a
    softmax output layer.

    Args:
        num_of_blocks: Number of convolution blocks (default 4).
        filters: Filters per block convolution (default 512).
        num_classes: Units of the softmax output layer (default 17).
        l2_factor: L2 regularisation factor of the first dense layer
            (default 0.01).

    The defaults reproduce the previously hard-coded architecture.

    NOTE(review): the default output width is 17 while the other models in
    this file use 12 outputs; confirm which data set this model is meant
    for, or pass ``num_classes`` explicitly.
    """

    def __init__(self, num_of_blocks=4, filters=512, num_classes=17,
                 l2_factor=0.01):
        super().__init__()
        self.input_cnn = Conv2D(32, (3, 3), strides=(1, 1), activation="relu")
        # dynamic creation of convolution blocks
        self.cnn_block = Sequential()
        for _ in range(num_of_blocks):
            self.cnn_block.add(
                Conv2D(filters, (3, 3), strides=(1, 1), activation="relu"))
            self.cnn_block.add(BatchNormalization())
            self.cnn_block.add(MaxPool2D((2, 2), padding='same'))
        self.flatten = Flatten()
        # The factor is passed positionally because its keyword name differs
        # between Keras versions (`l` in older releases, `l2` in newer ones).
        self.dense1 = Dense(512, activation="relu",
                            kernel_regularizer=keras.regularizers.l2(l2_factor))
        self.dense2 = Dense(128, activation="relu")
        self.dense3 = Dense(num_classes, activation="softmax")

    def call(self, inputs):
        """Run the forward pass and return the softmax class scores."""
        x = self.input_cnn(inputs)
        x = self.cnn_block(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)

In [None]:
class ResNetSpeech(tf.keras.Model):
    """Convolutional classifier with two additive skip connections.

    Layer stack:
      * 32-filter 3x3 ReLU input convolution;
      * block 1: ``num_of_blocks`` x (128-filter 3x3 conv -> batch norm ->
        2x2 max pooling);
      * block 2: ``num_of_blocks`` x (128-filter 1x1 conv -> batch norm),
        whose output is added to the output of block 1;
      * block 3: ``num_of_blocks`` x (256-filter 3x3 conv -> batch norm ->
        2x2 max pooling);
      * block 4: ``num_of_blocks`` x (256-filter 1x1 conv -> batch norm),
        whose output is added to the output of block 3;
      * flatten, 256-unit ReLU dense layer with L2 kernel regulariser,
        128-unit sigmoid dense layer, sigmoid output layer.

    Args:
        num_of_blocks: Repetitions inside each of the four blocks
            (default 2).
        num_classes: Units of the output layer (default 12).

    The defaults reproduce the previously hard-coded architecture.

    NOTE(review): the output layer uses a sigmoid activation although the
    other models in this file end in softmax; confirm this is intended.
    """

    def __init__(self, num_of_blocks=2, num_classes=12):
        super().__init__()
        self.input_cnn = Conv2D(32, (3, 3), strides=(1, 1), activation="relu")

        # dynamic creation of convolution blocks
        self.cnn_block1 = Sequential()
        for _ in range(num_of_blocks):
            self.cnn_block1.add(
                Conv2D(128, (3, 3), strides=(1, 1), activation="relu"))
            self.cnn_block1.add(BatchNormalization())
            self.cnn_block1.add(MaxPool2D((2, 2), padding='same'))

        # 1x1 convolutions without pooling keep the tensor shape, so the
        # result can be added to the block's input.
        self.cnn_block2 = Sequential()
        for _ in range(num_of_blocks):
            self.cnn_block2.add(
                Conv2D(128, (1, 1), strides=(1, 1), activation="relu"))
            self.cnn_block2.add(BatchNormalization())

        self.cnn_block3 = Sequential()
        for _ in range(num_of_blocks):
            self.cnn_block3.add(
                Conv2D(256, (3, 3), strides=(1, 1), activation="relu"))
            self.cnn_block3.add(BatchNormalization())
            self.cnn_block3.add(MaxPool2D((2, 2), padding='same'))

        self.cnn_block4 = Sequential()
        for _ in range(num_of_blocks):
            self.cnn_block4.add(
                Conv2D(256, (1, 1), strides=(1, 1), activation="relu"))
            self.cnn_block4.add(BatchNormalization())

        # Layers are created once here instead of on every forward pass.
        self.add1 = tf.keras.layers.Add()
        self.add2 = tf.keras.layers.Add()

        self.flatten = Flatten()
        # The factor is passed positionally because its keyword name differs
        # between Keras versions (`l` in older releases, `l2` in newer ones).
        self.dense1 = Dense(256, activation="relu",
                            kernel_regularizer=keras.regularizers.l2(0.01))
        self.dense2 = Dense(128, activation="sigmoid")
        self.dense3 = Dense(num_classes, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass and return the per-class output scores."""
        x = self.input_cnn(inputs)

        # First skip connection: block 1 output + block 2 output.
        skip = self.cnn_block1(x)
        x = self.add1([self.cnn_block2(skip), skip])

        # Second skip connection: block 3 output + block 4 output.
        skip = self.cnn_block3(x)
        x = self.add2([self.cnn_block4(skip), skip])

        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)








In [None]:
'''
This program does different types of plotting based on model comparisons
'''

import pickle
import matplotlib.pyplot as plt


def plot_three_comparison(history1,history2,history3):
    """Compare the validation curves of three training runs.

    Shows two figures in sequence: validation accuracy, then validation
    loss with the y-axis limited to [0, 1]. The metric names of
    ``history1`` are printed before each figure.

    Args:
        history1, history2, history3: Objects whose ``history`` attribute
            holds 'val_accuracy' and 'val_loss' series. They are labelled,
            in this order, as the 128-, 256- and 512-filter models.
    """
    runs = (history1, history2, history3)
    run_labels = ('4 CNNBlocks each 128 filter', '4 CNNBlocks each 256 filter',
                  '4 CNNBlocks each 512 filter')

    # validation accuracy of every run
    print(history1.history.keys())
    for run in runs:
        plt.plot(run.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(list(run_labels), loc='lower right')
    plt.grid(True)
    plt.minorticks_on()
    plt.show()

    # validation loss of every run, clipped to [0, 1]
    print(history1.history.keys())
    for run in runs:
        plt.plot(run.history['val_loss'])
    plt.gca().set_ylim([0, 1])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(list(run_labels), loc='upper right')
    plt.grid(True)
    plt.minorticks_on()
    plt.show()


def plot_two_comparison(history1,history2):
    """Compare the validation curves of two training runs.

    Shows two figures in sequence: validation accuracy, then validation
    loss with the y-axis limited to [0, 1]. The metric names of
    ``history1`` are printed before each figure.

    Args:
        history1: History labelled as the regularised 512-filter model.
        history2: History labelled as the plain 512-filter model.
    """
    runs = (history1, history2)
    run_labels = ('4 CNNBlocks each 512 filter with regularisation',
                  '4 CNNBlocks each 512 filter')

    # validation accuracy of both runs
    print(history1.history.keys())
    for run in runs:
        plt.plot(run.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(list(run_labels), loc='lower right')
    plt.grid(True)
    plt.minorticks_on()
    plt.show()

    # validation loss of both runs, clipped to [0, 1]
    print(history1.history.keys())
    for run in runs:
        plt.plot(run.history['val_loss'])
    plt.gca().set_ylim([0, 1])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(list(run_labels), loc='upper right')
    plt.grid(True)
    plt.minorticks_on()
    plt.show()


def plot(history):
    """Show the accuracy and loss curves of a single training run.

    Prints the recorded metric names, then displays two figures one after
    the other: accuracy, and loss with the y-axis limited to [0, 2]. Each
    figure holds the training and the validation curve.

    Args:
        history: Object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values.
    """
    curves = history.history
    print(curves.keys())
    # (metric name, optional y-axis limits) for each figure, in display order
    figures = (('accuracy', None), ('loss', [0, 2]))
    for metric, y_limits in figures:
        plt.plot(curves[metric])
        plt.plot(curves['val_' + metric])
        if y_limits is not None:
            plt.gca().set_ylim(y_limits)
        plt.title('model ' + metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(['train_CNN', 'test_CNN'], loc='upper left')
        plt.minorticks_on()
        plt.grid(True)
        plt.show()


def load_history(filename):
    """Load a pickled training history from ``filename``.

    Args:
        filename: Path of a file written with ``pickle.dump``.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load history files
    # that this project produced itself.
    with open(filename, 'rb') as inbuffer:
        return pickle.load(inbuffer)


if __name__ == '__main__':
    # Every path is written on a single line: a plain quoted string cannot
    # continue onto the next source line.
    history_dir = "/home/saba/PycharmProjects/saba/history/"

    # Effect of the number of filters per convolution block.
    history1 = load_history(history_dir + "model_4_block_128.hist")
    history2 = load_history(history_dir + "model_4_block_256.hist")
    history3 = load_history(history_dir + "model_4_block_512.hist")
    plot_three_comparison(history1, history2, history3)

    # Regularised versus plain 512-filter model.
    history1 = load_history(history_dir + "model_4_block_512_regularised.hist")
    history2 = load_history(history_dir + "model_4_block_512.hist")
    plot_two_comparison(history1,history2)

    # Single run stored as "extendeddata".
    history1 = load_history(history_dir + "model_4_512_extendeddata.hist")
    plot(history1)



In [None]:
'''
load model and print confusion matrix.
'''
from CNN_model import CNNSpeech
from keras.utils.vis_utils import plot_model
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.metrics import Metric
from sklearn.metrics import plot_confusion_matrix
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt



def dataset(train,val):
    """Build the training and validation image datasets.

    Both directories are read with the same settings: labels inferred from
    the directory structure, one-hot ("categorical") label vectors,
    grayscale images, ``image_size=(98, 50)`` and batches of 128.

    Args:
        train: Path of the directory holding the training images.
        val: Path of the directory holding the validation images.

    Returns:
        A ``(train_set, val_set)`` tuple.
    """
    def _load(folder):
        # Single place that defines how an image folder is read.
        return keras.utils.image_dataset_from_directory(
            directory=folder,
            labels="inferred",
            color_mode="grayscale",
            label_mode="categorical",
            batch_size=128,
            image_size=(98, 50))

    train_set = _load(train)
    val_set = _load(val)
    return train_set, val_set

if __name__ == '__main__':
    # Load a trained model, predict the whole validation set and display
    # the resulting confusion matrix.
    train_data = "/home/saba/PycharmProjects/saba/speechImageData/TrainingData"
    val_data = "/home/saba/PycharmProjects/saba/speechImageData/ValidationData"
    train_set, val_set = dataset(train_data,val_data)

    # With labels="inferred", label index i belongs to class_names[i]
    # (sub-directories in alphanumeric order). Reading the names from the
    # dataset keeps the axis labels aligned with the indices; a hand-written
    # list in a different order would mislabel the matrix.
    classes = val_set.class_names

    # The path is written on one line: a plain quoted string cannot
    # continue onto the next source line.
    model = keras.models.load_model(
        "/home/saba/PycharmProjects/saba/model/model_resnet")
    model.compile()

    true_label = []
    pred_label = []
    for data, label in val_set:
        # Predicted class = index of the highest score.
        pred_label.append(np.argmax(model.predict(data), axis=1))
        # True class = position of the 1 in the one-hot label vector.
        true_label.append(np.argmax(label.numpy(), axis=1))

    y_pred = np.concatenate(pred_label,axis=0)
    y_true = np.concatenate(true_label,axis=0)
    print(y_pred.shape)
    print(y_true.shape)

    # Passing every class index keeps the matrix at len(classes) x
    # len(classes) even if some class never occurs in this split, so it
    # always matches display_labels.
    conf_obj = confusion_matrix(y_true, y_pred,
                                labels=np.arange(len(classes)))
    display = ConfusionMatrixDisplay(confusion_matrix=conf_obj,
                                     display_labels=classes)
    display.plot()
    plt.show()


