In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import librosa
import jiwer
import torch
import gc
import torchaudio
import torchaudio.transforms as tat

from IPython.display import display
from dataclasses import dataclass, field
from tqdm.notebook import tqdm
from bnunicodenormalizer import Normalizer
from torch.utils.data import Dataset, DataLoader
from typing import Any, Dict, List, Optional, Union
import wandb

# Import Hugging Face Transformers related modules
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

from transformers.integrations import WandbCallback
# Additional libraries
import cloudpickle as cpkl

import warnings

# Suppress all warnings (not only DeprecationWarnings) to keep cell output readable
warnings.filterwarnings("ignore")

In [2]:
# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
device

'cuda'

In [3]:
# --- Audio sampling rates (Hz) ---
# SAMPLE_RATE is the rate audio is loaded at and declared to the processor.
SAMPLE_RATE = 16_000
# NOTE(review): presumably the native rate of the source clips - confirm.
ORIG_SAMPLE_RATE = 32_000

# --- Checkpoint and output locations ---
MODEL_PATH = "ai4bharat/indicwav2vec_v1_bengali"
OUTPUT_DIR = './'

# --- Backend switches ---
torch.backends.cudnn.benchmark = True

In [4]:
# Load the pretrained processor and CTC model from MODEL_PATH.
processor = Wav2Vec2Processor.from_pretrained(MODEL_PATH)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_PATH)

# Token -> id mapping from the tokenizer, plus a copy ordered by ascending id.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = dict(sorted(vocab_dict.items(), key=lambda pair: pair[1]))

In [5]:
# Read the transcript table from disk and report its row count.
train_csv_path = "/home/moonlab/datasets/bengali/train.csv"
df = pd.read_csv(train_csv_path)
print(df.shape[0])

963636


In [6]:
# Preview the first five rows of the transcript table.
df.head(n=5)

Unnamed: 0,id,sentence,split
0,000005f3362c,ও বলেছে আপনার ঠিকানা!,train
1,00001dddd002,কোন মহান রাষ্ট্রের নাগরিক হতে চাও?,train
2,00001e0bc131,"আমি তোমার কষ্টটা বুঝছি, কিন্তু এটা সঠিক পথ না।",train
3,000024b3d810,নাচ শেষ হওয়ার পর সকলে শরীর ধুয়ে একসঙ্গে ভোজন...,train
4,000028220ab3,"হুমম, ওহ হেই, দেখো।",train


In [7]:
# Show which split labels exist, then carve the frame into train / valid parts.
split_column = df['split']
separator = "*-"*50

print("The unique values of 'df[split]' are :")
display(split_column.unique())
print(separator)

# Boolean masks select the rows belonging to each split.
train_df = df.loc[split_column == 'train']
valid_df = df.loc[split_column == 'valid']

print("The *train* data is the following:")
display(train_df)
print(f"The len of train data is {len(train_df)}")
print(separator)

print("The *valid* data is the following:")
display(valid_df)
print(f"The len of valid data is {len(valid_df)}")

The unique values of 'df[split]' are :


array(['train', 'valid'], dtype=object)

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
The *train* data is the following:


Unnamed: 0,id,sentence,split
0,000005f3362c,ও বলেছে আপনার ঠিকানা!,train
1,00001dddd002,কোন মহান রাষ্ট্রের নাগরিক হতে চাও?,train
2,00001e0bc131,"আমি তোমার কষ্টটা বুঝছি, কিন্তু এটা সঠিক পথ না।",train
3,000024b3d810,নাচ শেষ হওয়ার পর সকলে শরীর ধুয়ে একসঙ্গে ভোজন...,train
4,000028220ab3,"হুমম, ওহ হেই, দেখো।",train
...,...,...,...
963631,ffffd07108b7,আপনার সাথে কথা বলতে চাই।,train
963632,ffffde37678a,সুতরাং পরের দিন আর-একটা ছবি না লইয়া চিত্রকর ছা...,train
963633,ffffe1b5f095,"সামাজিক কর্মকাণ্ডসমিতিতে গিয়ে দেখা যায়, শিল্পী...",train
963634,ffffec31636e,গুগল ম্যাপসের সাহায্যে খুঁজে পাওয়া যাবে কোন জা...,train


The len of train data is 934048
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
The *valid* data is the following:


Unnamed: 0,id,sentence,split
20,0000e711c2b1,তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রত...,valid
59,00036c2a2d9d,কৃত্তিবাস রামায়ণ-বহির্ভূত অনেক গল্প এই অনুবাদ...,valid
100,00065e317123,তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাস...,valid
101,00065f40df52,তিনি বিজয়নগর সাম্রাজ্যের বিরুদ্ধে এবং বিজাপুর...,valid
146,0009b022c8ea,এটি মূলত একটি মরুময় অঞ্চল।,valid
...,...,...,...
963545,fffa8ced44f1,এই কাজের জন্য বিশেষ পারদর্শিতা থাকা শিল্পী থাকে।,valid
963586,fffd69cdd76c,আবদুল লতিফ আঞ্জুমানের সঙ্গে প্রত্যক্ষভাবে জড়ি...,valid
963589,fffd9ff269bc,"পত্রের বিষয়বস্তু হলো ""যান্ত্রিক গণনার মডুলেশন...",valid
963593,fffdb13febaf,"তবে, ব্যান্ডটির অফিসিয়াল ফেসবুক পাতায়, তাকে ...",valid


The len of valid data is 29588


In [8]:
class BengaliASRDataset(Dataset):
    """Dataset yielding processor-ready audio features and transcript label ids.

    Each item is a dict with two keys:
      * ``input_values`` - output of the module-level ``processor`` for the clip
      * ``labels`` - token ids the ``processor`` produces for the transcript

    Args:
        df: Frame with an ``id`` column (audio file stem) and a ``sentence``
            column (transcript text).
        audio_dir: Directory containing the ``<id>.mp3`` files. Defaults to the
            location that was previously hard-coded.
    """

    def __init__(self, df: pd.DataFrame,
                 audio_dir: str = '/home/moonlab/datasets/bengali/train_mp3s'):
        self.df = df
        self.audio_dir = audio_dir
        self.paths = df['id'].values
        self.sentences = df['sentence'].values

    def __len__(self):
        """Return the number of rows (clips) in the backing frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Load, featurize and tokenize the clip at ``index``."""
        aud_path = os.path.join(self.audio_dir, f'{self.paths[index]}.mp3')

        # librosa.load resamples to the requested ``sr`` on its own, so the
        # waveform is already at SAMPLE_RATE here. The former extra
        # 32000 -> SAMPLE_RATE torchaudio resample was applied on top of that
        # and halved the effective rate a second time, so it has been removed.
        waveform, _ = librosa.load(aud_path, sr=SAMPLE_RATE)

        batch = dict()
        batch["input_values"] = processor(
            waveform, sampling_rate=SAMPLE_RATE
        ).input_values[0]
        with processor.as_target_processor():
            batch["labels"] = processor(self.sentences[index]).input_ids

        return batch

In [9]:
# Demo: run the Bengali unicode normalizer over each whitespace-separated token
# of a sample sentence and print the 'normalized' field of every result.
text = "সেই পাতায় লেখা আছে: শ্রীমতী হেমনলিনীর প্রতি অক্ষয়শ্রদ্ধার উপহার।"

bnorm = Normalizer()
words = text.split()

for token in words:
    print(bnorm(token)['normalized'])

সেই
পাতায়
লেখা
আছে:
শ্রীমতী
হেমনলিনীর
প্রতি
অক্ষয়শ্রদ্ধার
উপহার।


In [10]:
# Wrap each split frame in the dataset class (train first, then valid).
train_dataset, valid_dataset = (
    BengaliASRDataset(split_frame) for split_frame in (train_df, valid_df)
)

In [11]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate individual examples into one padded batch for CTC training.

    Audio inputs and label ids have different lengths and are therefore padded
    in two separate ``processor.pad`` calls, each with its own length options.
    Padded label positions are set to -100.
    """
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate every example into its audio part and its label part.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        # Pad the audio inputs.
        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the label ids while the processor is in target mode.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions whose attention mask is not 1 are padding; mark them with
        # -100 so they are ignored by the loss.
        is_padding = padded_labels["attention_mask"] != 1
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [12]:
# Batch collator that pads with the shared processor (dynamic padding enabled).
data_collator = DataCollatorCTCWithPadding(
    padding=True,
    processor=processor,
)

In [13]:
def compute_metrics(pred):
    """Compute the word error rate for a set of CTC predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the argmax over the last
            axis gives the predicted token ids) and ``label_ids`` (reference
            token ids in which -100 marks positions to ignore).

    Returns:
        A dict ``{"wer": <float>}`` as computed by ``jiwer.wer``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Substitute the pad id for the -100 markers on a copy, so the caller's
    # ``label_ids`` array is left untouched (it used to be overwritten in place).
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    return {"wer": jiwer.wer(label_str, pred_str)}

In [14]:
# Open a Weights & Biases run for this training session.
wandb.init(
    entity="bengali_asr",
    project="benga.ai",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mslam_louis[0m ([33mbengali_asr[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
# Config overrides passed alongside the checkpoint when building the CTC model.
model_overrides = dict(
    # regularisation
    attention_dropout=0.1,
    hidden_dropout=0.1,
    layerdrop=0.1,
    mask_time_prob=0.05,
    # CTC loss behaviour
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,
    # vocabulary taken from the processor's tokenizer
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    # NOTE(review): confirm this setting has any effect on a CTC head.
    diversity_loss_weight=100,
)

# Rebinds `model`, replacing the instance loaded earlier without overrides.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_PATH, **model_overrides)

In [16]:
# Trainer hyper-parameters, grouped by concern. Values are unchanged.
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_steps=20,
    save_total_limit=1,
    # --- evaluation / model selection ---
    evaluation_strategy="epoch",
    eval_steps=20,
    prediction_loss_only=False,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # --- batching ---
    per_device_train_batch_size=256,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    auto_find_batch_size=True,
    group_by_length=False,
    # --- optimisation schedule ---
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    warmup_steps=600,
    weight_decay=0.01,
    fp16=True,
    # --- logging ---
    logging_steps=20,
    report_to="wandb",
)

In [17]:
# Assemble the Trainer from the model, arguments, datasets, collator and metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is passed in this slot.
    tokenizer=processor.feature_extractor,
)

In [18]:
# Run fine-tuning with the configured Trainer.
trainer.train()
# Close the Weights & Biases run once training has returned.
wandb.finish()
# Persist the trainer's model to OUTPUT_DIR.
trainer.save_model(output_dir=OUTPUT_DIR)

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
