# ECS 289G: Automatic Speech Recognition using Convolutional Neural Network-Recurrent Neural Network (CNN-RNN)

In this notebook, I implement Automatic Speech Recognition (ASR) using a Convolutional Neural Network-Recurrent Neural Network (CNN-RNN) architecture. The model is trained on the LJ Speech dataset.

In [1]:
!pip install mltu

Collecting mltu
  Downloading mltu-1.1.7-py3-none-any.whl (45 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting onnxruntime>=1.15.0 (from mltu)
  Downloading onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs (from onnxruntime>=1.15.0->mltu)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime>=1.15.0->mltu)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
# Importing the libraries
import os
import tarfile
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO

In [6]:
# Downloading and unzipping the LJSpeech Dataset
def download_and_unzip(url, extract_to='/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Datasets', chunk_size=1024*1024):
    """Download a bz2-compressed tar archive and extract it into ``extract_to``.

    The archive is read in ``chunk_size`` pieces into an in-memory buffer
    (progress is shown with tqdm) and then unpacked with ``tarfile``.

    Args:
        url: Location of the ``.tar.bz2`` archive; opened with ``urlopen``.
        extract_to: Directory to extract into. Created if it does not exist.
        chunk_size: Number of bytes requested per read.
    """
    # A BytesIO buffer grows in place; repeatedly doing ``bytes += chunk``
    # would copy everything downloaded so far on every iteration.
    buffer = BytesIO()

    # The context manager closes the response even if the download fails.
    with urlopen(url) as http_response:
        # The length is missing/None when the server sends no Content-Length;
        # tqdm then falls back to a plain byte counter without a total.
        total_bytes = getattr(http_response, 'length', None)
        with tqdm(total=total_bytes, unit='B', unit_scale=True) as progress:
            # Read until EOF rather than for a precomputed number of
            # iterations, so short reads cannot truncate the archive.
            while True:
                chunk = http_response.read(chunk_size)
                if not chunk:
                    break
                buffer.write(chunk)
                progress.update(len(chunk))

    os.makedirs(extract_to, exist_ok=True)

    buffer.seek(0)
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with archives from a trusted source.
    with tarfile.open(fileobj=buffer, mode='r|bz2') as tar_file:
        tar_file.extractall(path=extract_to)

# The download is skipped when the extracted corpus folder already exists.
dataset_path = '/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Datasets/LJSpeech-1.1'
if not os.path.exists(dataset_path):
    download_and_unzip(
        'https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2',
        extract_to='/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Datasets',
    )

In [7]:
# Defining the model configs
from datetime import datetime

from mltu.configs import BaseModelConfigs

class ModelConfigs(BaseModelConfigs):
    """Settings for the speech-to-text model, gathered on one config object."""

    def __init__(self):
        super().__init__()

        # Each run is stored in a folder named after the current minute.
        run_stamp = datetime.now().strftime("%Y%m%d%H%M")
        self.model_path = os.path.join(
            "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Models/05_sound_to_text",
            run_stamp,
        )

        # Spectrogram extraction parameters
        self.frame_length = 256
        self.frame_step = 160
        self.fft_length = 384

        # Characters kept from the transcriptions
        self.vocab = "abcdefghijklmnopqrstuvwxyz'?! "

        # Placeholders; left as None here and expected to be set by the caller
        self.input_shape = None
        self.max_text_length = None
        self.max_spectrogram_length = None

        # Training hyper-parameters
        self.batch_size = 8
        self.learning_rate = 0.0005
        self.train_epochs = 100
        self.train_workers = 20

In [8]:
# Creating the model configs
import pandas as pd
from mltu.preprocessors import WavReader

dataset_path = "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Datasets/LJSpeech-1.1"
metadata_path = dataset_path + "/metadata.csv"
wavs_path = dataset_path + "/wavs/"

# metadata.csv is pipe-separated with no header row; quoting=3 (csv.QUOTE_NONE)
# keeps quote characters inside the transcriptions as literal text.
metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
metadata_df.columns = ["file_name", "transcription", "normalized_transcription"]
metadata_df = metadata_df[["file_name", "normalized_transcription"]]

# [wav file path, normalized transcription] pairs; the path is built from
# wavs_path so the dataset location is defined in exactly one place.
dataset = [[f"{wavs_path}{file}.wav", label] for file, label in metadata_df.values.tolist()]

# Create a ModelConfigs object to store model configurations
configs = ModelConfigs()

# Scan the whole dataset once to find the longest filtered label and the
# longest spectrogram; both are used later as padding targets.
max_text_length, max_spectrogram_length = 0, 0
feature_dim = None
for file_path, label in tqdm(dataset):
    spectrogram = WavReader.get_spectrogram(file_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length)
    # Only characters present in the vocabulary count towards the label length
    valid_label = [c for c in label.lower() if c in configs.vocab]
    max_text_length = max(max_text_length, len(valid_label))
    max_spectrogram_length = max(max_spectrogram_length, spectrogram.shape[0])
    feature_dim = spectrogram.shape[1]

# input_shape depends only on the final maximum, so it is set once after the
# loop; it stays None when the dataset is empty.
if feature_dim is not None:
    configs.input_shape = [max_spectrogram_length, feature_dim]

configs.max_spectrogram_length = max_spectrogram_length
configs.max_text_length = max_text_length
configs.save()

100%|██████████| 13100/13100 [1:49:45<00:00,  1.99it/s]


In [9]:
# Importing mltu specific libraries
from mltu.tensorflow.dataProvider import DataProvider
from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CERMetric, WERMetric

In [10]:
# Creating a data provider for the LJ speech dataset
# Step that turns a wav path into a spectrogram, using the framing settings from configs
audio_preprocessors = [
    WavReader(
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    ),
]

# Steps applied afterwards: pad spectrograms with zeros, index the labels with
# the vocabulary, and pad labels with the value len(configs.vocab)
batch_transformers = [
    SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0),
    LabelIndexer(configs.vocab),
    LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
]

data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=audio_preprocessors,
    transformers=batch_transformers,
)

INFO:DataProvider:Skipping Dataset validation...


In [11]:
# Dividing the data provider into a training part and a validation part (split value 0.9)
train_data_provider, val_data_provider = data_provider.split(split=0.9)

In [12]:
import tensorflow as tf
from keras import layers
from keras.models import Model

from mltu.tensorflow.model_utils import residual_block, activation_layer

# The network we have built
def train_model(input_dim, output_dim, activation='leaky_relu', dropout=0.2):
    """Build the CNN-RNN network (this constructs the model; it does not fit it).

    Architecture: two Conv2D + BatchNorm + activation stages, a reshape that
    merges the last two axes, five bidirectional LSTM layers (dropout after
    the first four), a 256-unit dense layer and a softmax output layer.

    Args:
        input_dim: Shape passed to ``layers.Input`` (no batch axis).
        output_dim: Number of output classes before the extra class is added;
            the softmax layer has ``output_dim + 1`` units (presumably the
            extra unit is the CTC blank -- confirm against the loss in use).
        activation: Activation name forwarded to ``activation_layer`` after
            each convolution and after the dense layer.
        dropout: Dropout rate used between the recurrent layers and after the
            dense layer.

    Returns:
        An uncompiled Keras ``Model`` mapping ``inputs`` to the softmax output.
    """
    inputs = layers.Input(shape=input_dim, name="input")

    # Append a trailing axis so the input can be fed to the 2-D convolutions.
    x = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs)

    # Convolution layers 1 and 2: (kernel_size, strides) per stage
    conv_stages = (
        ([11, 41], [2, 2]),
        ([11, 21], [1, 2]),
    )
    for kernel_size, strides in conv_stages:
        x = layers.Conv2D(filters=32, kernel_size=kernel_size, strides=strides, padding="same", use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        # Honour the caller's choice instead of a hard-coded 'leaky_relu'.
        x = activation_layer(x, activation=activation)

    # Reshaping the resulting volume to feed the RNN layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    # RNN layers: dropout follows every recurrent layer except the last one
    num_rnn_layers = 5
    for layer_idx in range(num_rnn_layers):
        x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
        if layer_idx < num_rnn_layers - 1:
            x = layers.Dropout(dropout)(x)

    # Dense layer
    x = layers.Dense(256)(x)
    x = activation_layer(x, activation=activation)
    x = layers.Dropout(dropout)(x)

    # Classification layer
    output = layers.Dense(output_dim + 1, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=output)
    return model

In [13]:
# Instantiating the network: the input shape and the class count come from
# configs, and the dropout rate is set to 0.5 for this run
model = train_model(input_dim=configs.input_shape, output_dim=len(configs.vocab), dropout=0.5)

In [14]:
# Compiling the model
# I have used the Adam optimizer and CTC loss for this task
# Optimizer with the learning rate stored in configs
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=configs.learning_rate)

# Character and word error rate metrics, both built from the vocabulary
error_rate_metrics = [
    CERMetric(vocabulary=configs.vocab),
    WERMetric(vocabulary=configs.vocab),
]

model.compile(optimizer=adam_optimizer, loss=CTCloss(), metrics=error_rate_metrics, run_eagerly=False)

In [15]:
!pip install tf2onnx

Collecting tf2onnx
  Downloading tf2onnx-1.15.1-py3-none-any.whl (454 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.7/454.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting onnx>=1.4.1 (from tf2onnx)
  Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx, tf2onnx
Successfully installed onnx-1.15.0 tf2onnx-1.15.1


In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

# Defining callbacks
# Every metric-driven callback monitors val_CER and treats lower values as better.
earlystopper = EarlyStopping(monitor='val_CER', patience=20, verbose=1, mode='min')
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor='val_CER', verbose=1, save_best_only=True, mode='min')
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f'{configs.model_path}/logs', update_freq=1)
# mode='min' is stated explicitly to match the other callbacks instead of
# relying on the name-based inference of mode='auto'.
reduceLROnPlat = ReduceLROnPlateau(monitor='val_CER', factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode='min')
model2onnx = Model2onnx(f"{configs.model_path}/model.h5")

# Saving training and validation datasets as csv files.
# This happens before training so the split is still recorded when the
# (long-running) fit call is interrupted or fails.
os.makedirs(configs.model_path, exist_ok=True)
train_data_provider.to_csv(os.path.join(configs.model_path, 'train.csv'))
val_data_provider.to_csv(os.path.join(configs.model_path, 'val.csv'))

# Training the model
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    workers=configs.train_workers
)

In [17]:
# Testing it on validation dataset of LJ Speech and obtaining the Average CER and WER
import typing
import numpy as np

from mltu.inferenceModel import OnnxInferenceModel
from mltu.preprocessors import WavReader
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

class WavToTextModel(OnnxInferenceModel):
    """Inference model that turns one spectrogram into decoded text."""

    def __init__(self, char_list: typing.Union[str, list], *args, **kwargs):
        """Store ``char_list``; all other arguments go to the parent constructor."""
        super().__init__(*args, **kwargs)
        self.char_list = char_list

    def predict(self, data: np.ndarray):
        """Run the model on a single sample and return the decoded text.

        ``self.model`` and ``self.input_name`` are not set in this class;
        presumably they are provided by ``OnnxInferenceModel`` -- confirm.
        """
        # Add a leading axis of size one so the sample forms a batch
        batch = np.expand_dims(data, axis=0)

        # Only the first output of the run is used
        outputs = self.model.run(None, {self.input_name: batch})
        decoded = ctc_decoder(outputs[0], self.char_list)

        # One sample went in, so the first decoded entry is returned
        return decoded[0]

if __name__ == "__main__":
    import pandas as pd
    from tqdm import tqdm
    from mltu.configs import BaseModelConfigs

    # Load the configuration saved for the 20231128 model
    configs = BaseModelConfigs.load("/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Models/05_sound_to_text/20231128/configs.yaml")

    model = WavToTextModel(model_path=configs.model_path, char_list=configs.vocab, force_cpu=False)

    # Rows of the stored validation split, read as (wav path, label) pairs
    df = pd.read_csv("/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Models/05_sound_to_text/20231128/val.csv").values.tolist()

    accum_cer, accum_wer = [], []
    for wav_path, label in tqdm(df):
        spectrogram = WavReader.get_spectrogram(
            wav_path,
            frame_length=configs.frame_length,
            frame_step=configs.frame_step,
            fft_length=configs.fft_length,
        )

        # Zero-pad the end of the first axis up to configs.max_spectrogram_length
        rows_to_add = configs.max_spectrogram_length - spectrogram.shape[0]
        padded_spectrogram = np.pad(spectrogram, ((0, rows_to_add), (0, 0)), mode='constant', constant_values=0)

        text = model.predict(padded_spectrogram)

        # Reference text: lower-cased label restricted to vocabulary characters
        true_label = "".join(ch for ch in label.lower() if ch in configs.vocab)

        sample_cer = get_cer(text, true_label)
        sample_wer = get_wer(text, true_label)
        accum_cer.append(sample_cer)
        accum_wer.append(sample_wer)

    print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}")

100%|██████████| 1310/1310 [08:16<00:00,  2.64it/s]

Average CER: 0.02472644738965785, Average WER: 0.09590515265714795





In [18]:
# Transcribing one LJ Speech clip (LJ010-0251) as a spot check
sample_wav_path = "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Datasets/LJSpeech-1.1/wavs/LJ010-0251.wav"
spectrogram = WavReader.get_spectrogram(
    sample_wav_path,
    frame_length=configs.frame_length,
    frame_step=configs.frame_step,
    fft_length=configs.fft_length,
)

# Zero-pad the end of the first axis up to configs.max_spectrogram_length
pad_rows = configs.max_spectrogram_length - spectrogram.shape[0]
padded_spectrogram = np.pad(spectrogram, ((0, pad_rows), (0, 0)), mode='constant', constant_values=0)

text = model.predict(padded_spectrogram)
print(text)

francis was sentenced to be hanged decapitated end quartered


In [19]:
# Testing it with custom audios (the audios which we obtained after running speaker diarization and audio splicing)
import os

directory_path = "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/Dataset/spliced"
output_file_path = "/content/drive/MyDrive/Fall 2023/ECS 289G/Project/all_transcriptions_cnn-rnn.txt"

# Keep regular files only. Filtering happens before sorting so that
# sub-directories never reach the integer parse in the sort key below.
files = [
    name for name in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, name))
]

# Order by the integer found between the last '_' and the first '.' that follows it
sorted_files = sorted(files, key=lambda x: int(x.split('_')[-1].split('.')[0]))

# Opening the output file once for writing
with open(output_file_path, 'w') as output_file:
    for filename in sorted_files:
        # Predicting the transcriptions for each of the spliced audio files
        file_path = os.path.join(directory_path, filename)
        spectrogram = WavReader.get_spectrogram(file_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length)

        # Every input is brought to exactly configs.max_spectrogram_length rows.
        # A clip longer than that is truncated first, because np.pad raises
        # ValueError on a negative pad width.
        spectrogram = spectrogram[:configs.max_spectrogram_length]
        pad_rows = configs.max_spectrogram_length - spectrogram.shape[0]
        padded_spectrogram = np.pad(spectrogram, ((0, pad_rows), (0, 0)), mode='constant', constant_values=0)

        transcription_text = model.predict(padded_spectrogram)
        # The speaker id is the second '_'-separated field of the file name
        speaker_id = filename.split('_')[1]

        # Writing to the text file
        output_file.write(f"Speaker {speaker_id}: {transcription_text}\n")

        # Printing the transcription
        print(f"Speaker {speaker_id}: {transcription_text}\n")

# Reported after the with-block, i.e. once the file has been closed
print(f"All transcriptions successfully stored in {output_file_path}")



Speaker 00: 

Speaker 00: thrus hosurby lurd

Speaker 01: kinohibminave winwoond honmer dloth

Speaker 00: ogothgoad yiton mintguorine goded i dirgardoewat miador tha indimidenth bevenbitinoaymabu and now une wooking re micouted froni howard you

Speaker 01: i am dues whoe gin on amaenron comb maminuam benop goth would enhumeweyman migament

Speaker 00: bone gride thom and the howe the thaughtered on throught moth not hudin fismarded

Speaker 00: chould

All transcriptions successfully stored in /content/drive/MyDrive/Fall 2023/ECS 289G/Project/all_transcriptions_cnn-rnn.txt
