# Building an end-to-end Speech Recognition model in PyTorch - [AssemblyAI](https://www.assemblyai.com/)

## installing the requirements

In [1]:
# !pip install comet-ml==3.0.2 -qq

## imports

In [2]:
import os

from comet_ml import Experiment
from accelerate import Accelerator, notebook_launcher
from accelerate.utils import set_seed

import gc
import torch
import torch.nn as nn


import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np

from dataset import IPS1ASRDataset
from utils import *

[2023-09-05 09:51:37,712] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


## Setting up your data pipeline

In [3]:
class TextTransform:
    """Maps characters to integers and vice versa"""
    def __init__(self):
        char_map_arr = [
            '<SPACE>', 'а', 'ә', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'җ', 'з', 'и', 'й', 'к', 'л',
            'м', 'н', 'ң', 'о', 'ө', 'п', 'р', 'с', 'т', 'у', 'ү', 'ф', 'х', 'һ', 'ц', 'ч', 'ш', 'щ',
            'ъ', 'ы', 'ь', 'э', 'ю', 'я'
        ]

        self.char_map = {}
        self.index_map = {}
        for index in range(len(char_map_arr)):
            ch = char_map_arr[index]
            self.char_map[ch] = index
            self.index_map[index] = ch
        self.index_map[0] = ' '

    def text_to_int(self, text):
        """ Use a character map and convert text to 
        an integer sequence """
        int_sequence = []
        for c in text:
            if c == ' ':
                ch = self.char_map['<SPACE>']
            elif c in self.char_map:
                ch = self.char_map[c]
            else: 
                continue
            int_sequence.append(ch)
        return int_sequence

    def int_to_text(self, labels):
        """ Use a character map and convert integer labels to 
        an text sequence """
        string = []
        for i in labels:
            string.append(self.index_map[i])
        return ''.join(string).replace('<SPACE>', ' ')

In [4]:
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram()

text_transform = TextTransform()



In [5]:
def data_processing(data, data_type="train"):
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance, _, _, _) in data:
        if data_type == 'train':
            spec = train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        elif data_type == 'valid':
            spec = valid_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        else:
            raise Exception('data_type should be train or valid')
        spectrograms.append(spec)
        utterance = utterance.lstrip('\ufeff')
        
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        input_lengths.append(spec.shape[0]//2)
        label_lengths.append(len(label))

    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths

In [6]:
def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    targets = []
    for i, args in enumerate(arg_maxes):
        decode = []
        targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j != 0 and index == args[j -1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    return decodes, targets

## The Model
Base of of Deep Speech 2 with some personal improvements

In [7]:
class CNNLayerNorm(nn.Module):
    """Layer normalization built for cnns input"""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous() # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous() # (batch, channel, feature, time)

In [8]:
class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
        except with layer norm instead of batch norm
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()

        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x # (batch, channel, feature, time)

In [9]:
class BidirectionalGRU(nn.Module):

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()

        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x

In [10]:
class SpeechRecognitionModel(nn.Module):

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats//2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting heirachal features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i==0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i==0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2) # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x

## The Training and Evaluating Script

In [11]:
class IterMeter(object):
    """keeps track of total iterations"""
    def __init__(self):
        self.val = 0

    def step(self):
        self.val += 1

    def get(self):
        return self.val

In [12]:
def train(model, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, experiment, accelerator):
    model.train()
    data_len = len(train_loader.dataset)
    with experiment.train():
        for batch_idx, _data in enumerate(train_loader):
            torch.autograd.set_detect_anomaly(True)
            
            spectrograms, labels, input_lengths, label_lengths = _data
            optimizer.zero_grad()

            output = model(spectrograms)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1) # (time, batch, n_class)
            
            loss = criterion(output, labels, input_lengths, label_lengths)
            accelerator.backward(loss)

            experiment.log_metric('loss', loss.item(), step=iter_meter.get())
            experiment.log_metric('learning_rate', scheduler.get_lr(), step=iter_meter.get())

            optimizer.step()
            scheduler.step()
            iter_meter.step()
            if batch_idx % 100 == 0 or batch_idx == data_len:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(spectrograms), data_len,
                    100. * batch_idx / len(train_loader), loss.item()))

In [13]:
def test(model, test_loader, criterion, epoch, iter_meter, experiment, accelerator):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with experiment.test():
        with torch.no_grad():
            for i, _data in enumerate(test_loader):
                spectrograms, labels, input_lengths, label_lengths = _data

                output = model(spectrograms)  # (batch, time, n_class)
                output = F.log_softmax(output, dim=2)
                output = output.transpose(0, 1) # (time, batch, n_class)

                loss = criterion(output, labels, input_lengths, label_lengths)
                test_loss += loss.item() / len(test_loader)

                decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
                for j in range(len(decoded_preds)):
                    test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                    test_wer.append(wer(decoded_targets[j], decoded_preds[j]))


    avg_cer = sum(test_cer) / len(test_cer)
    avg_wer = sum(test_wer) / len(test_wer)
    experiment.log_metric('test_loss', test_loss, step=iter_meter.get())
    experiment.log_metric('cer', avg_cer, step=iter_meter.get())
    experiment.log_metric('wer', avg_wer, step=iter_meter.get())

    print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(
        test_loss, avg_cer, avg_wer))

## Setting up Comet
If you have a comet account, fill in teh api key, project name and experiment name below. You can create an account at [comet.ml](comet.ml).

## GPU runtime
If you are using a GPU runtime, this will let you know what GPU and how much memory is available. Adjust your batch_size depending on which GPU

In [14]:
torch.cuda.empty_cache()
gc.collect()

80

In [15]:
!nvidia-smi

Tue Sep  5 09:51:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   41C    P0    37W / 250W |   2586MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:02:00.0 Off |                    0 |
| N/A   48C    P0    31W / 250W |      4MiB / 32768MiB |      0%      Default |
|       

## Train
this will download the data on first run and may take a while.

If you have Comet.ml setup, you can start seeing your progress in the comet cell above.

In [16]:
def training_function():
    comet_api_key = "GUmMcuBnNsVBJjnslfRGmqKkI" # add your api key here
    project_name = "TatAsr"
    experiment_name = "TatAsr-cnn-rnn-accelerator-2gpu"

    experiment = Experiment(
        api_key=comet_api_key, 
        project_name=project_name, 
        parse_args=False, 
        log_code=True, 
        auto_output_logging="default"
    )

    experiment.set_name(experiment_name)

    set_seed(42)
    torch.manual_seed(7)
    
    accelerator = Accelerator()
    
    ips_dataset_train = IPS1ASRDataset('../tatar_tts/train/')
    ips_dataset_valid = IPS1ASRDataset('../tatar_tts/valid/')
    
    learning_rate = 0.001
    learning_rate *= 2
    batch_size = 128
    epochs = 30
    
    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 40, # Длина алфавита
        "n_feats": 128,
        "stride": 2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], 
        hparams['n_rnn_layers'], 
        hparams['rnn_dim'],
        hparams['n_class'], 
        hparams['n_feats'], 
        hparams['stride'], 
        hparams['dropout']
    )

    experiment.log_parameters(hparams)
    

    train_loader = data.DataLoader(
        dataset=ips_dataset_train,
        batch_size=hparams['batch_size'],
        shuffle=True,
        collate_fn=lambda x: data_processing(x, 'train'),
    )
    
    test_loader = data.DataLoader(
        dataset=ips_dataset_valid,
        batch_size=hparams['batch_size'],
        shuffle=False,
        collate_fn=lambda x: data_processing(x, 'valid'),
    )

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=39, zero_infinity=True) # Длина алфавита - 1
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, 
        max_lr=hparams['learning_rate'],
        steps_per_epoch=int(len(train_loader)),
        epochs=hparams['epochs'],
        anneal_strategy='linear'
    )

    iter_meter = IterMeter()
    
    model, optimizer, train_loader, test_loader, scheduler, experiment = accelerator.prepare(
        model, optimizer, train_loader, test_loader, scheduler, experiment
    )
    
    for epoch in range(1, epochs + 1):
        train(model, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, experiment, accelerator)
        test(model, test_loader, criterion, epoch, iter_meter, experiment, accelerator)
        accelerator.save_model(model, f'./models/TatAsr-1-accelerator-epoch-1-{epoch}')
    
    experiment.end()

In [17]:
notebook_launcher(training_function, num_processes=2)

Launching training on 2 GPUs.


COMET INFO: Experiment is live on comet.ml https://www.comet.com/gumaonelove/tatasr/5c5cfbf1114442d99fcbce45a00bd2a3

COMET INFO: Experiment is live on comet.ml https://www.comet.com/gumaonelove/tatasr/9d9d00c29ac7403ca87d06307f9d5ecb





evaluating...
evaluating...

Test set: Average loss: 4.9125, Average CER: 0.988174 Average WER: 1.0000

Test set: Average loss: 4.9383, Average CER: 0.988035 Average WER: 0.9990



evaluating...
evaluating...

Test set: Average loss: 3.9383, Average CER: 0.988174 Average WER: 1.0000

Test set: Average loss: 3.9241, Average CER: 0.988035 Average WER: 0.9990


evaluating...
evaluating...

Test set: Average loss: 3.6178, Average CER: 0.988035 Average WER: 0.9990

Test set: Average loss: 3.6187, Average CER: 0.988174 Average WER: 1.0000



evaluating...

evaluating...
Test set: Average loss: 3.6342, Average CER: 0.988174 Average WER: 1.0000

Test set: Average loss: 3.6366, Average CER: 0.988035 Average WER: 0.9990



evaluating...
evaluating...

Test set: Average loss: 3.6116, Average CER: 0.988174 Average WER: 1.0000

Test set: Average loss: 3.6209, Average CER: 0.988035 Average WER: 0.9990


evaluating...
evaluating...

Test set: Average loss: 3.6255, Average CER: 0.988035 Average WER:

## Результаты обучения
### Модель 
Использовалась модель из статьи [Building an End-to-End Speech Recognition Model in PyTorch](https://www.assemblyai.com/blog/end-to-end-speech-recognition-pytorch/) **Deep Speech 2**. 

### Гипер параметры
* `n_cnn_layers` = 3
* `n_rnn_layers` = 5
* `rnn_dim` = 512
* `n_class` = 29
* `n_feats` = 128
* `stride` = 2
* `dropout` = 0.1
* `learning_rate` = 0.001
* `batch_size` = 80
* `epochs` = 10
* `num_workers` = 8

### Метрики
Ниже будут приведены средний 
* `CER` - частота ошибок в символах
* `WER` - частота ошибок в словах

Формула расчета
**WER** = (S+D+I)/N = (S+D+I)/(S+D+C), где:

* **S** — количество замен
* **D** — количество удалений
* **I** — количество вставок
* **C** — количество корректных слов
* **N** — количество слов в исходной строке

Итоговык метрики 
* `CER` = 0.24 
* `WER` = 0.80

### Выводы
1. Обучать ASR на одной видео карте **tesla v100** вышеупомянутую модель 2.5 часа, из-за этого проблематично тестировать гипотезы и количество слоев в данной модели.