We loaded the data from the bengaliAI organization on Hugging Face (`bengaliAI/cvbn`) and made the following adjustments:

1. Normalized using bnunicodenormalizer
2. Tokenized using Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-xls-r-300m-bengali")
3. Removed special characters
4. Resampled at 16000Hz (required for wav2vec2 training later on)
5. Trimmed leading and trailing silence using the threshold amplitude < max_amplitude/30. This makes training faster.
6. Uploaded to huggingface: https://huggingface.co/Lancelot53

In [None]:
%%capture
!pip install transformers
!pip install jiwer
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

In [None]:
from datasets import load_dataset

# Load the bengaliAI/cvbn dataset; the following cells read its "train" and
# "validation" splits.
dataset = load_dataset("bengaliAI/cvbn")

In [None]:
# Keep only the clips whose up-votes outnumber their down-votes, in both splits.
for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].filter(
        lambda ups, downs: ups > downs,
        input_columns=["up_votes", "down_votes"],
    )


In [None]:
import re

# Punctuation stripped from every transcript. Written as a raw string so the
# backslashes reach the regex engine verbatim instead of being parsed as
# (invalid) string escape sequences, which newer Python versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\—\‘\'\‚\“\”\…]'
# Compiled once; the function below runs for every example in the dataset.
_chars_to_ignore = re.compile(chars_to_ignore_regex)

def remove_special_characters(batch):
    """Remove the ignored punctuation from ``batch["sentence"]``.

    Args:
        batch: A single example; must contain the transcript under "sentence".

    Returns:
        The same ``batch`` object, with "sentence" stripped of every character
        matched by ``chars_to_ignore_regex`` and a single space appended.
    """
    batch["sentence"] = _chars_to_ignore.sub('', batch["sentence"]) + " "
    return batch

# Strip punctuation from the transcripts of both splits, then drop the
# metadata columns that the later cells do not read.
_metadata_columns = ['up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment']

for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].map(remove_special_characters)

for split_name in ("train", "validation"):
    dataset[split_name] = dataset[split_name].remove_columns(_metadata_columns)


In [None]:
# Install and import the Bengali Unicode normalizer applied to the transcripts.
!pip install bnunicodenormalizer
from bnunicodenormalizer import Normalizer 
# Shared normalizer instance; `normalize` calls it once per word.
bnorm=Normalizer()


def normalize(batch):
    """Normalize ``batch["sentence"]`` word by word with ``bnorm``.

    The sentence is split on whitespace, each word is passed through the
    normalizer, words whose normalized form is ``None`` are dropped, and the
    survivors are re-joined with single spaces.

    Returns:
        The same ``batch`` object with its "sentence" replaced.
    """
    kept_words = []
    for raw_word in batch["sentence"].split():
        cleaned = bnorm(raw_word)['normalized']
        if cleaned is not None:
            kept_words.append(cleaned)

    batch["sentence"] = " ".join(kept_words)
    return batch

# Run the word-level normalization over the whole `dataset` object.
dataset = dataset.map(normalize)

In [None]:

from transformers import Wav2Vec2Processor

# Pretrained processor; prepare_dataset uses it to turn the audio into
# input values and the transcript into label ids.
processor = Wav2Vec2Processor.from_pretrained("arijitx/wav2vec2-xls-r-300m-bengali")

In [None]:
from datasets import Audio

# Cast the audio column to 16 kHz, the same rate prepare_dataset passes to the processor.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
import numpy as np

def prepare_dataset(batch):
    """Turn one example into model inputs and label ids.

    Strips exact-zero samples from both ends of the waveform (in place on
    ``batch["audio"]``), runs the processor on it to fill "input_values" and
    "input_length", and encodes ``batch["sentence"]`` into "labels".

    Returns:
        The same ``batch`` object with the three keys added.
    """
    clip = batch["audio"]
    clip["array"] = np.trim_zeros(clip["array"], 'fb')

    # The processor returns a batch of size one; keep its only element.
    features = processor(clip["array"], sampling_rate=16000)
    batch["input_values"] = features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # The transcript is encoded while the processor is in target mode.
    with processor.as_target_processor():
        encoded_text = processor(batch["sentence"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [None]:
# `dataset` is indexed by split, so its `column_names` maps split name -> column
# list; `remove_columns` needs the plain column list of the split being mapped.
dataset_train = dataset['train'].map(prepare_dataset, remove_columns=dataset['train'].column_names)
dataset_validation = dataset['validation'].map(prepare_dataset, remove_columns=dataset['validation'].column_names)

In [None]:
def trim_silence(batch, threshold=30):
    """Trim leading and trailing low-amplitude samples from "input_values".

    A sample counts as signal when ``threshold * abs(sample)`` exceeds the
    clip's peak absolute amplitude, i.e. when its amplitude is above
    ``peak / threshold``. Everything before the first and after the last such
    sample is dropped, and "input_length" is updated to the new length.

    Args:
        batch: A single example with "input_values" (a sequence of samples)
            and "input_length".
        threshold: Ratio between the peak amplitude and the quietest sample
            that is still kept. Defaults to 30.

    Returns:
        The same ``batch`` object. It is returned unchanged when
        "input_values" is empty or no sample passes the threshold (for
        example an all-zero clip), since there is nothing to anchor the trim on.
    """
    samples = batch['input_values']

    # Compare magnitudes, not signed values: a strongly negative sample is
    # just as loud as a strongly positive one.
    peak = max((abs(sample) for sample in samples), default=0)

    first = next(
        (i for i, sample in enumerate(samples) if threshold * abs(sample) > peak),
        None,
    )
    if first is None:
        return batch

    # A loud sample exists (at `first`), so the backward scan always finds one.
    last = next(
        i for i in range(len(samples) - 1, -1, -1)
        if threshold * abs(samples[i]) > peak
    )

    batch['input_values'] = samples[first:last + 1]
    batch['input_length'] = last - first + 1
    return batch
    



In [None]:
# Only the training split is trimmed here; dataset_validation stays as
# produced by prepare_dataset.
dataset_train = dataset_train.map(trim_silence)

In [None]:
# dataset_train.push_to_hub("commonVoice_bn_train_preprocessed_trimmed")
# dataset_validation.push_to_hub("commonVoice_bn_validation_preprocessed")