In [1]:
%%capture
!pip install pyctcdecode
!python -m pip install pypi-kenlm
!pip install jiwer
!pip install bnunicodenormalizer

!pip install aksharamukha
!pip install -q torchaudio omegaconf



![](https://www.respeecher.com/hubfs/What-is-Text-to-Speech-TTS%29-Initial-Speech-Synthesis-Explained-Respeecher-voice-cloning-software.jpeg)

![](https://developer-blogs.nvidia.com/wp-content/uploads/2019/12/automatic-speech-recognition_updated.png)

![](https://www.researchgate.net/profile/Diana-Militaru/publication/299594444/figure/fig1/AS:346834426974208@1459703179403/The-block-diagram-of-an-automatic-speech-recognition-and-understanding-system.png)

In this notebook we demonstrate how to calculate the CER and WER metrics on the validation dataset using an XLS-R wav2vec2 model. We use the best publicly available pretrained model from Hugging Face to demonstrate the metric calculation process. To understand how to train wav2vec2 on this dataset, please check our past work: [wav2vec2 starter](https://www.kaggle.com/code/nazmuddhohaansary/wave2vec2-starter-for-dl-sprint-commonvoice)

# Imports

In [2]:
import os
import numpy as np
from tqdm.auto import tqdm
from glob import glob
from transformers import AutoFeatureExtractor, pipeline
import pandas as pd
import librosa
import IPython
from datasets import load_metric
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
import re
import gc
import wave
from scipy.io import wavfile
import scipy.signal as sps

import torchaudio
from IPython.display import Audio, display
from aksharamukha import transliterate
import random

from bnunicodenormalizer import Normalizer 

# Register tqdm with pandas so `Series.progress_apply` (used when fixing the
# validation paths) is available.
tqdm.pandas()
import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")
from pandarallel import pandarallel

# Enable `parallel_apply` on pandas objects, using 8 worker processes and a
# progress bar.
pandarallel.initialize(progress_bar=True,nb_workers=8)


# Record the library versions this run was produced with.
print(torch.__version__)
print(torchaudio.__version__)

# Shared Bengali unicode normalizer instance; called word by word in `normalize`.
bnorm=Normalizer()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
1.11.0
0.11.0


# Configs

In [3]:
#according to our experiment this is the best model -> arijitx/wav2vec2-xls-r-300m-bengali
from transformers import AutoTokenizer,AutoModelForCTC
class CFG:
    '''
    Notebook-wide configuration: the Hugging Face model id and the Kaggle
    input paths read by the cells below.
    '''
    # Model id used for every transcription in this notebook. Other candidates
    # noted by the author: arijitx/wav2vec2-large-xlsr-bengali,
    # arijitx/wav2vec2-xls-r-300m-bengali, Tahsin-Mayeesha/wav2vec2-bn-300m
    model_name = 'Anasss/bangasr'
    # CSV with the validation file names and reference sentences.
    valid_df_path = '../input/dlsprint/validation.csv'
    # CSV listing the test clips; also used as the submission template.
    sample_sub_df_path = '../input/dlsprint/sample_submission.csv'
    # Original mp3 directories (not read through CFG in the visible cells).
    valid = "../input/dlsprint/validation_files/"
    test = "../input/dlsprint/test_files/"
    # wav copies of the same clips; these are what the dataset class reads.
    valid_wav = '../input/validation-fileswav-format/validation_files_wav/'
    test_wav = '../input/test-wav-files-dl-sprint/test_files_wav/'
    batch_size = 48  # only referenced by the commented-out batched inference
#     tokenizer = '../input/bangla-asr-anas/token'
    # mp3 clip used for the single-sample demo and the audio preview.
    single_SPEECH_FILE = "../input/dlsprint/validation_files/common_voice_bn_30620258.mp3"
    # NOTE(review): not referenced anywhere in the visible cells — confirm it is still needed.
    post_asr_corrector = False
    



# single sample inference demo

In [4]:
# Run on the first GPU when one is available and fall back to the CPU (-1)
# otherwise, so the cell does not fail on a CPU-only kernel.
asr_device = 0 if torch.cuda.is_available() else -1
asr = pipeline("automatic-speech-recognition", model=CFG.model_name, device=asr_device)

# The feature extractor is kept as a notebook-level object because the dataset
# class reads its `sampling_rate` when resampling audio.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    CFG.model_name, cache_dir=None, use_auth_token=False
)

# Decode the demo clip at the sampling rate the model expects and transcribe it.
speech, sr = librosa.load(CFG.single_SPEECH_FILE, sr=feature_extractor.sampling_rate)
prediction = asr(speech, chunk_length_s=112, stride_length_s=None)

pred = prediction["text"]
pred


Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/381 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/32.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.83G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.2M [00:00<?, ?B/s]

'তার পিতার নাম কালীপ্রসন্ন ভট্টাচার্য।'

# check the original audio

In [5]:
# Render an inline player for the demo clip so the transcription above can be
# checked by ear.
Audio(CFG.single_SPEECH_FILE)

# Fix paths

In [6]:
# Load the validation metadata and turn every bare file name in `path` into a
# path under the validation audio directory. Both locations come from CFG so
# they cannot drift from the values the rest of the notebook uses.
df = pd.read_csv(CFG.valid_df_path)
directory = CFG.valid
df["path"] = df["path"].progress_apply(lambda x: os.path.join(directory, str(x)))
df.head(3)

  0%|          | 0/7747 [00:00<?, ?it/s]

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale
0,c0494c8220a53efec93f188e32be94d3c1832c48117423...,../input/dlsprint/validation_files/common_voic...,"কৃষি, সেবা, রেমিটেন্স, ব্যবসা ও অন্যান্য।",3.0,0.0,,,,bn
1,c0494c8220a53efec93f188e32be94d3c1832c48117423...,../input/dlsprint/validation_files/common_voic...,তিনি ছিলেন চাকমা ভাষার প্রথম আধুনিক গীতিকার।,6.0,1.0,,,,bn
2,c06b36547c86713d53bb2bf696a34b696de586c5ab1aa9...,../input/dlsprint/validation_files/common_voic...,ইংরেজির সাথে সাথে তাদের হিন্দী ও সংস্কৃত শিক্ষ...,3.0,1.0,,,,bn


# Custom dataset class

Decoding mp3 files with librosa is very slow, so we use pre-converted wav files for faster inference.

In [7]:
class bn_asr_Dataset(Dataset):
    '''
    Map-style dataset that returns one resampled waveform per row of a CSV.

    args:
        df            : path of the dataframe (a CSV with a `path` column
                        holding the audio file names)
        dir           : directory of sound files; the wav file with the same
                        stem as each `path` entry is read from here
        sampling_rate : target sampling rate. When None (the default) the rate
                        is read from the notebook-level `feature_extractor`
                        each time an item is fetched.
    '''
    def __init__(self,df,dir,sampling_rate=None):
        self.df = pd.read_csv(df)
        self.dir = dir
        self.sampling_rate = sampling_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        '''
        Return the i-th clip as a 1-D array resampled to the target rate.

        Raises IndexError when `i` is out of range, which is what lets plain
        `for clip in dataset` iteration stop at the end.
        '''
        # Positional lookup (not label lookup) so an out-of-range index raises
        # IndexError instead of KeyError.
        file_name = str(self.df["path"].iloc[i])
        # os.path.join works whether or not `dir` ends with a separator.
        path = os.path.splitext(os.path.join(self.dir, file_name))[0] + '.wav'

        # Read file
        source_rate, data = wavfile.read(path)
        # Multi-channel files come back as (samples, channels); average the
        # channels so the result is always a single-channel waveform.
        if data.ndim > 1:
            data = data.mean(axis=1)

        target_rate = self.sampling_rate
        if target_rate is None:
            target_rate = feature_extractor.sampling_rate

        # Resample data
        number_of_samples = round(len(data) * float(target_rate) / source_rate)
        speech = sps.resample(data, number_of_samples)
        return speech
  


# making prediction on whole validation set

In [8]:
%%time
#single image inference
''' 
#super slow inference...

predictions = []
references = []
for i in range(len(df.path)):
    speech, sr = librosa.load(df.path[i], sr=feature_extractor.sampling_rate)
    prediction = asr(speech, chunk_length_s=112, stride_length_s=None)
    pred = prediction["text"]
    predictions.append(pred)
    references.append(df.sentence[i])
    
print(len(predictions),len(references))
'''

df = pd.read_csv(CFG.valid_df_path)

valid_dataset = bn_asr_Dataset(CFG.valid_df_path,CFG.valid_wav)#CFG.valid
predictions = []
references = []
# for i,pred_sentence in enumerate(tqdm(asr(valid_dataset, chunk_length_s=112, stride_length_s=None,batch_size=CFG.batch_size), total=len(valid_dataset))):
#     references.append(df.sentence[i])
#     predictions.append(pred_sentence['text'])
    
for i in range(len(valid_dataset)):
    pred = asr(valid_dataset.__getitem__(i), chunk_length_s=112, stride_length_s=None)
    references.append(df.sentence[i])
    predictions.append(pred['text'])
  

CPU times: user 15min 41s, sys: 5.49 s, total: 15min 47s
Wall time: 17min 37s


In [9]:
torch.cuda.empty_cache() 
gc.collect()
!nvidia-smi

Fri Aug 12 17:21:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P0    43W / 250W |   2167MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

# WER (word error rate) calculation process



![](https://miro.medium.com/max/700/1*MUGLdWm3zMYK7dLmyo3pqA.png)

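$$\mathrm{WER} = 100 \times \frac{\mathrm{INS} + \mathrm{SUB} + \mathrm{DEL}}{N}$$

where INS, SUB and DEL are the numbers of inserted, substituted and deleted words, and N is the number of words in the reference text.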

![](http://www.italk2learn.eu/wp-content/uploads/2015/02/speech-bubble-image.png)

# CER (character error rate) calculation process



Character error rate (CER) is a common metric of the performance of an automatic speech recognition system. This value indicates the percentage of characters that were incorrectly predicted. The lower the value, the better the performance of the ASR system, with a CER of 0 being a perfect score.

CER calculation is based on the concept of [Levenshtein distance](https://towardsdatascience.com/evaluating-ocr-output-quality-with-character-error-rate-cer-and-word-error-rate-wer-853175297510#9bd1), where we count the minimum number of character-level operations required to transform the ground truth text (aka reference text) into the system output (the OCR output in the linked article, the ASR transcription here).

Character Error Rate (CER) formula :

![](https://miro.medium.com/max/700/1*KsWFDKnLI7mudmhbzGjc4w.png)

where:

* S = Number of Substitutions
* D = Number of Deletions
* I = Number of Insertions
* N = Number of characters in reference text (aka ground truth)

Let’s look at an example:

**Ground Truth Reference Text**: 809475127

**ASR Transcribed Output Text**: 80g475Z7

Several errors require edits to transform ASR output into the ground truth:

1. g instead of 9 (at reference text character 3)
2. Missing 1 (at reference text character 7)
3. Z instead of 2 (at reference text character 8)

With that, here are the values to input into the equation:

* Number of Substitutions (S) = 2
* Number of Deletions (D) = 1
* Number of Insertions (I) = 0
* Number of characters in reference text (N) = 9

Based on the above, we get (2 + 1 + 0) / 9 = 0.3333. When converted to a percentage value, the CER becomes 33.33%. This implies that every 3rd character in the sequence was incorrectly transcribed.

We repeat this calculation for all the pairs of transcribed output and corresponding ground truth, and take the mean of these values to obtain an overall CER percentage.

**Reference :** [Evaluate OCR Output Quality with Character Error Rate (CER) and Word Error Rate (WER)](https://towardsdatascience.com/evaluating-ocr-output-quality-with-character-error-rate-cer-and-word-error-rate-wer-853175297510#5aec)

# calculating metric on whole validation set

In [10]:

# Pair each prediction with its reference sentence, one row per validation clip.
df = pd.DataFrame({'predictions': predictions, 'references': references})


# Unicode Normalizer


From the [webinar supplementary notebook :: DL SPRINT](https://www.kaggle.com/code/nazmuddhohaansary/webinar-suplimentary-notebook-dl-sprint)

In [11]:

def normalize(sen):
    '''
    Normalize a sentence word by word with the notebook-level `bnorm` callable.

    The sentence is split on whitespace, each word is replaced by the
    'normalized' entry of `bnorm(word)`, words whose normalized form is None
    are dropped, and the rest are joined with single spaces.
    '''
    kept_words = []
    for raw_word in sen.split():
        fixed_word = bnorm(raw_word)['normalized']
        if fixed_word is not None:
            kept_words.append(fixed_word)
    return " ".join(kept_words)

# df.predictions= df.predictions.parallel_apply(lambda x:normalize(x))
# df.references= df.references.parallel_apply(lambda x:normalize(x))
# df.to_csv('./results.csv',index = False) #use it for error analysis and other stuffs
# df.head(10)

# Without Post Processing

In [12]:
# Load both metrics once; the post-processing section reuses `cer` and `wer`.
cer = load_metric("cer")
wer = load_metric("wer")

# Score the raw predictions against the references over the whole validation set.
cer_score = cer.compute(references=df["references"], predictions=df["predictions"])
wer_score = wer.compute(references=df["references"], predictions=df["predictions"])

print("validation cer_score -> ", cer_score)
print("validation wer_score -> ", wer_score)

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

validation cer_score ->  0.02976367134398303
validation wer_score ->  0.10398584544226028


* without bnunicodenormalizer our cv was (as discussed [here](https://www.kaggle.com/competitions/dlsprint/discussion/334951)) :

**validation cer_score -> 0.09787704766628824**

**validation wer_score -> 0.30921300101701055**

* with bnunicodenormalizer our cv is :

**validation cer_score -> 0.09668792125091967**

**validation wer_score -> 0.3049220524108723**

# With  post processing

During error analysis using the results.csv file, we saw that the model frequently fails to predict punctuation. Almost all the sentences in the ground truth end with '।', but the best publicly available model usually omits the final '।'. So, in the simple post-processing code below, we check whether each predicted sentence ends with '।', and if it does not, we append '।' to the end of the predicted sentence.

In [13]:
# Make sure every prediction ends with the Bengali full stop '।'.
# `str.endswith` is used instead of indexing the last character so an empty
# prediction becomes a lone '।' rather than raising IndexError, and the column
# is rebuilt in a single assignment instead of chained `df.predictions[i] = ...`
# writes, which pandas does not guarantee will reach the frame.
df["predictions"] = [
    text if text.endswith('।') else text + '।'
    for text in df["predictions"]
]

In [14]:
# Re-score after post-processing, reusing the metric objects loaded earlier.
cer_score = cer.compute(references=df["references"], predictions=df["predictions"])
wer_score = wer.compute(references=df["references"], predictions=df["predictions"])

print("Final validation cer_score -> ", cer_score)
print("Final validation wer_score -> ", wer_score)

Final validation cer_score ->  0.02975952049378514
Final validation wer_score ->  0.10408336700149069


* without bnunicodenormalizer our post processed model's cv was (as discussed [here](https://www.kaggle.com/competitions/dlsprint/discussion/334951#1856670) ) :

**validation cer_score -> 0.09301847750965592**

**validation wer_score -> 0.28501372267654884**

* with bnunicodenormalizer our final cv (with post processing) is :

**validation cer_score -> 0.09173650517129109**

**validation wer_score -> 0.28054166260326835**

#  with bnunicodenormalizer we've got 0.004472060073280493 WER improvement

# Submission with post processing

In [15]:
# Start the submission from the sample file. The path comes from CFG so it is
# the same file the test dataset is built from. The last line shows the number
# of clips to transcribe.
df = pd.read_csv(CFG.sample_sub_df_path)
len(df.path)

7747

In [16]:
%%time

test_dataset = bn_asr_Dataset(CFG.sample_sub_df_path,CFG.test_wav)

# for i,prediction in enumerate(tqdm(asr(test_dataset, chunk_length_s=112, stride_length_s=None,batch_size=CFG.batch_size), total=len(test_dataset))):
#     df.sentence[i] = prediction["text"]
    
for i in range(len(test_dataset)):
    pred = asr(test_dataset.__getitem__(i), chunk_length_s=112, stride_length_s=None)
    
    #applying simple post processing with error handler
    try:
        if(pred["text"][-1] == '।'):
            df.sentence[i] = pred["text"]
        else:
            df.sentence[i] = pred["text"]+'।'
    except:
        print("predicted text at idx ",i," is -> ",pred["text"])
        df.sentence[i] = pred["text"]+'।'
        
df.sentence=df.sentence.parallel_apply(lambda x:normalize(x)) #unicode normalizer


predicted text at idx  3936  is ->  


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=969), Label(value='0 / 969'))), HB…

CPU times: user 16min 52s, sys: 6.46 s, total: 16min 58s
Wall time: 18min 48s


In [17]:
# Preview the first rows of the submission frame after inference.
df.head(3)

Unnamed: 0,path,sentence
0,common_voice_bn_31675220.mp3,এছাড়াও নিউজিল্যান্ড ক্রিকেট দলের হয়েও খেলছেন ত...
1,common_voice_bn_31513116.mp3,এই ফল পাখি রাখায় কিন্তু নিচে পড়ে থাকা ফল খেলে ...
2,common_voice_bn_31558126.mp3,জন পরিকল্পিত।


In [18]:
# Write the submission file without the index column, then show one
# transcription as a spot check.
# NOTE(review): the file name is 'submissiona.csv' — confirm this is intended
# rather than 'submission.csv'.
df.to_csv('./submissiona.csv',index = False)
df.sentence[1]

'এই ফল পাখি রাখায় কিন্তু নিচে পড়ে থাকা ফল খেলে কুকুর অসুস্থ হয়ে পড়ে।'

In [19]:
# Listen to the test clip listed in row 0 of the submission frame; its
# transcription is displayed in the next cell.
IPython.display.Audio('../input/dlsprint/test_files/common_voice_bn_31675220.mp3')

In [20]:
# Transcription produced for row 0 of the submission frame.
df.sentence[0]

'এছাড়াও নিউজিল্যান্ড ক্রিকেট দলের হয়েও খেলছেন তিনি।'

In [21]:
# Another spot check: the transcription produced for row 80.
df.sentence[80]

'প্রথমশ্রেণীর ক্রিকেট প্রতিযোগিতা শেল শীল্ডের উদ্বোধনী আসরে অংশ নেয়।'

# optional (post ASR correction attempt)

In this section we try to implement recent research on post-OCR (optical character recognition) correction, titled [Post-OCR Document Correction with large Ensembles of Character Sequence-to-Sequence Models](https://arxiv.org/pdf/2109.06264.pdf). This research was done in the OCR domain and not in the ASR domain, so I wondered what would happen if we tried this approach in the ASR domain. **Well, if you never try, you'll never know.**
The core of this system is a standard sequence-to-sequence model that can correct sequences of characters. In the implementation below, we used a Transformer as the sequence model, which takes as input a segment of characters from the document to correct, and the output is the corrected segment. To train this sequence model, it is necessary to align the raw documents with their corresponding correct transcriptions, which is not always straightforward. Since the output is not necessarily of the same length as the input (because of possible insertions or deletions of characters), a decoding method like Greedy Search or Beam Search
is needed to produce the most likely corrected sequence according to the model.
For the experiment below we use results.csv, where the references column contains the actual clean annotation and the predictions column contains the output of the STT model, including its errors.


Be careful:
if you predict on the train set using the best publicly available Bangla ASR model from Hugging Face, you will see the model produce NaN predictions for many audio samples in the train set. To get the indices of those NaN output files, I used the code below.


![](https://i.ibb.co/741XzvD/post-stt-corrector.png)

I already tried training this in version 5 of this notebook: [commonvoice_bn xls-r metric calculation](https://www.kaggle.com/code/mobassir/commonvoice-bn-xls-r-metric-calculation)

# load and infer

# silero-tts demo for bangla

**------------------------>>>>>>>>>>>>>>>>>>>>>>>>>>>  high level overview**

![](https://miro.medium.com/max/1400/1*MwgQEqWrRMeQXdPBmwCh4g.png)
![](https://www.researchgate.net/profile/Suhas-Mache/publication/304601298/figure/fig3/AS:667867041247263@1536243319820/Block-diagram-of-Text-to-Speech-System-Techniques-of-speech-synthesis-5-a.png)

Even though this competition is all about STT (speech-to-text) recognition, the dataset isn't limited to the STT domain: you can also work on a TTS (text-to-speech) system using the dataset of this competition. We know that **deep learning is data-hungry**. One good idea for improving your ASR system could be to retrain your STT model with augmented data included in your pipeline, like this: [Transcoding & Augmenting Audio On-The-Fly](https://www.kaggle.com/code/shahruk10/transcoding-augmenting-audio-on-the-fly). But I was curious and also wondering: **can we also use the output of a TTS system for training our STT model? This is similar to a data augmentation technique, but without background noise, no?**
Sorry if I am wrong; I don't know whether it will help or not. I am not an expert in the ASR domain, just a beginner sharing his naive thoughts. Let's see how the silero TTS model from torch.hub works with Bangla.

**KEEP IN MIND -> because bengali is a low resource language, that's why except silero tts we don't have any better free tts system for bangla. beside bangla STT,we also need a powerful bangla TTS system as well.**


# improvement ideas


for better post ASR correction,i would like to recommend going deep in [ROBART](https://arxiv.org/pdf/2202.01157.pdf)
![](https://i.ibb.co/YWVGRVF/post-asr-corrector.png)

![](https://i.ibb.co/9sZbvD7/post-asr.png)
one example implementation of levenshtein transformer can be found [here](https://github.com/nmfisher/levenshtein_transformer/blob/master/Untitled.ipynb)

more about post ASR correction was discussed [here](https://www.kaggle.com/competitions/dlsprint/discussion/335411)

![](https://images.unsplash.com/photo-1499744937866-d7e566a20a61?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=870&q=80)