In [5]:
!pip install evaluate



In [12]:
# Import required libraries
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import (
    MT5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import re
from typing import List, Dict, Tuple
import logging
from datasets import load_dataset
from evaluate import load
import random
from tqdm.auto import tqdm

# Configure root logging once and create this notebook's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Seed every random number generator used below
def set_seed(seed=42):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA) with `seed`.

    Args:
        seed: Integer passed unchanged to each library's seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Apply the default seed as soon as the cell runs
set_seed()

In [13]:
# Text normalization and noise-based augmentation helpers
class TextPreprocessor:
    """Stateless helpers that normalize input text and generate noisy variants of it."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Return `text` as a trimmed, single-spaced, lowercase string.

        The value is passed through `str()` first, so non-string inputs are
        accepted.
        """
        trimmed = str(text).strip()
        single_spaced = re.sub(r'\s+', ' ', trimmed)
        return single_spaced.lower()

    @staticmethod
    def create_augmented_samples(text: str) -> List[str]:
        """Return `text` followed by up to two randomly perturbed copies.

        The first element is always the unmodified text. For inputs longer
        than three characters two more entries follow, in this order: a copy
        with one space inserted at a random interior position, and a copy
        with one random interior character removed. Positions are drawn from
        the global `random` module.
        """
        variants = [text]

        # Inputs of three characters or fewer are returned without noise.
        if len(text) <= 3:
            return variants

        last_interior = len(text) - 2

        # Variant 1: split the text by inserting a single space.
        insert_at = random.randint(1, last_interior)
        variants.append(text[:insert_at] + ' ' + text[insert_at:])

        # Variant 2: drop one interior character.
        drop_at = random.randint(1, last_interior)
        variants.append(text[:drop_at] + text[drop_at + 1:])

        return variants

In [14]:
# Custom Dataset
class BanglishBengaliDataset(Dataset):
    """Map-style dataset that tokenizes one (input text, target text) pair per item.

    Every item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    tensors, each padded or truncated to ``max_length`` tokens.
    """

    # Label value the loss should skip; -100 is the default ``ignore_index`` of
    # PyTorch's cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, texts: List[str], labels: List[str], tokenizer, max_length: int = 128):
        """Store the raw pairs and the tokenizer configuration.

        Args:
            texts: Source strings, one per example.
            labels: Target strings, aligned index-by-index with ``texts``.
            tokenizer: Callable tokenizer that accepts ``text`` / ``text_target``
                and exposes ``pad_token_id``.
            max_length: Length every encoded sequence is padded/truncated to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.preprocessor = TextPreprocessor()

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``; padding positions in ``labels`` are set to
            ``IGNORE_INDEX`` so they do not contribute to the loss.
        """
        # Clean the source text, then add the task prefix
        text = "transliterate Bengali: " + self.preprocessor.clean_text(self.texts[idx])
        label = str(self.labels[idx])

        # Tokenize inputs
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Tokenize labels; `text_target` replaces the deprecated
        # `as_target_tokenizer()` context manager.
        targets = self.tokenizer(
            text_target=label,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch axis (a bare squeeze() would also
        # drop the sequence axis when max_length == 1).
        label_ids = targets["input_ids"].squeeze(0)

        # Mask padding so the model is not trained to emit pad tokens.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            label_ids = label_ids.masked_fill(label_ids == pad_id, self.IGNORE_INDEX)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": label_ids
        }

In [15]:
# Metrics calculation
def compute_metrics(pred, tokenizer=None):
    """Compute character error rate (CER) and exact-match accuracy.

    Args:
        pred: Two-item iterable ``(predictions, labels)``. ``predictions`` may
            be token ids, raw logits shaped ``(batch, seq, vocab)``, or a tuple
            whose first element is one of those. ``labels`` are token ids in
            which ``-100`` marks ignored positions.
        tokenizer: Tokenizer used for decoding. When omitted, the notebook-level
            ``tokenizer`` global is used if it exists, otherwise
            ``transliterator.tokenizer``.

    Returns:
        dict with ``"cer"`` (value returned by the ``cer`` metric) and
        ``"exact_match"`` (fraction of predictions equal to their reference
        after stripping surrounding whitespace).

    Raises:
        RuntimeError: If no tokenizer is passed and none can be found in the
            notebook globals.
    """
    if tokenizer is None:
        scope = globals()
        tokenizer = scope.get("tokenizer")
        if tokenizer is None:
            tokenizer = getattr(scope.get("transliterator"), "tokenizer", None)
        if tokenizer is None:
            raise RuntimeError(
                "compute_metrics needs a tokenizer: pass tokenizer=... or define a "
                "global `tokenizer` / `transliterator`"
            )

    predictions, labels = pred

    # Model outputs can arrive as a tuple (logits first); keep only the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    # Raw logits -> most likely token id per position.
    from_logits = predictions.ndim == 3
    if from_logits:
        predictions = predictions.argmax(axis=-1)
        if predictions.shape == labels.shape:
            # Positions past the end of the reference carry no supervised
            # signal, so whatever was predicted there is blanked out.
            is_target = (labels != -100) & (labels != pad_id)
            predictions = np.where(is_target, predictions, pad_id)

    # -100 is not a valid token id; map it to padding before decoding.
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calculate character error rate (CER); the metric is loaded once and
    # reused across evaluations.
    cer_metric = getattr(compute_metrics, "_cer_metric", None)
    if cer_metric is None:
        cer_metric = load("cer")
        compute_metrics._cer_metric = cer_metric
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Calculate exact match score (guarding against an empty batch)
    exact_matches = sum(1 for p, l in zip(decoded_preds, decoded_labels) if p.strip() == l.strip())
    exact_match_score = exact_matches / len(decoded_preds) if len(decoded_preds) else 0.0

    return {
        "cer": cer,
        "exact_match": exact_match_score
    }

In [16]:
# Main Transliterator class
class ImprovedBanglishBengaliTransliterator:
    """Fine-tune an mT5 checkpoint on (`rm` -> `bn`) text pairs and run inference.

    The instance owns the tokenizer and model; `prepare_data` builds the
    datasets, `train` fine-tunes and saves, `transliterate` decodes one input.
    """

    def __init__(
        self,
        model_name: str = "google/mt5-large",
        max_length: int = 128
    ):
        """Load the tokenizer and model for `model_name` and move the model to the device.

        Args:
            model_name: Checkpoint identifier passed to `from_pretrained`.
            max_length: Maximum token length used for encoding and generation.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.model_name = model_name
        self.max_length = max_length

        # Load the pretrained tokenizer
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        # Load the pretrained weights and place them on the selected device
        self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)

    def prepare_data(
        self,
        data_path: str,
        test_size: float = 0.1,
        apply_augmentation: bool = True
    ) -> Tuple[Dataset, Dataset]:
        """Load the CSV, clean it, split it, and optionally augment the training part.

        The split happens BEFORE augmentation, so noisy variants of a sample can
        never end up on both sides of the train/validation boundary; the
        validation set contains only un-augmented rows.

        Args:
            data_path: CSV file with an `rm` (source) and a `bn` (target) column.
            test_size: Fraction of rows held out for validation.
            apply_augmentation: When True, each training text is replaced by the
                variants returned by `TextPreprocessor.create_augmented_samples`
                (which include the original text).

        Returns:
            `(train_dataset, val_dataset)`.

        Raises:
            ValueError: If the CSV lacks the `rm` or `bn` column.
        """
        logger.info("Loading dataset...")
        df = pd.read_csv(data_path)

        print(f"Initial dataset size: {len(df)}")

        required = ["rm", "bn"]
        missing = [col for col in required if col not in df.columns]
        if missing:
            raise ValueError(f"{data_path} is missing required column(s): {missing}")

        # Clean the data: only rows with both fields present are usable
        df = df.dropna(subset=required)
        preprocessor = TextPreprocessor()

        # Clean texts; astype(str) keeps non-string targets from breaking strip
        rm_clean = df['rm'].apply(preprocessor.clean_text)
        bn_clean = df['bn'].astype(str).str.strip()

        # Split the dataset
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            rm_clean.values, bn_clean.values,
            test_size=test_size,
            random_state=42
        )

        # Apply augmentation if enabled (training rows only)
        if apply_augmentation:
            augmented_texts = []
            augmented_labels = []
            pairs = zip(train_texts, train_labels)
            for text, label in tqdm(pairs, total=len(train_texts), desc="Augmenting data"):
                # The variants already start with the original text, so the
                # un-augmented rows must not be appended a second time.
                for variant in preprocessor.create_augmented_samples(text):
                    augmented_texts.append(variant)
                    augmented_labels.append(label)
            train_texts, train_labels = augmented_texts, augmented_labels
            print(f"Dataset size after augmentation: {len(train_texts) + len(val_texts)}")

        print(f"Training set size: {len(train_texts)}")
        print(f"Validation set size: {len(val_texts)}")

        # Create datasets
        train_dataset = BanglishBengaliDataset(
            train_texts, train_labels, self.tokenizer, self.max_length
        )
        val_dataset = BanglishBengaliDataset(
            val_texts, val_labels, self.tokenizer, self.max_length
        )

        return train_dataset, val_dataset

    @staticmethod
    def _argmax_logits(logits, labels):
        """Reduce logits to token ids so the Trainer does not accumulate full vocab-sized tensors."""
        if isinstance(logits, (tuple, list)):
            # Model outputs may carry extra tensors after the logits.
            logits = logits[0]
        return logits.argmax(dim=-1)

    def _compute_metrics(self, eval_pred):
        """Decode predictions/labels with this instance's tokenizer and score them.

        Returns a dict with `cer` and `exact_match`. Predictions come from a
        teacher-forced forward pass (argmax of the logits), not from free
        generation, so the scores are an optimistic proxy for `transliterate`.
        """
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.asarray(predictions)
        labels = np.asarray(labels)
        if predictions.ndim == 3:
            predictions = predictions.argmax(axis=-1)

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        # Ignore whatever was predicted at padded / ignored label positions.
        if predictions.shape == labels.shape:
            is_target = (labels != -100) & (labels != pad_id)
            predictions = np.where(is_target, predictions, pad_id)

        # -100 is not a valid token id; map it to padding before decoding.
        predictions = np.where(predictions < 0, pad_id, predictions)
        labels = np.where(labels < 0, pad_id, labels)

        decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Load the CER metric once per instance
        cer_metric = getattr(self, "_cer_metric", None)
        if cer_metric is None:
            cer_metric = load("cer")
            self._cer_metric = cer_metric
        cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)

        exact_matches = sum(
            1 for p, l in zip(decoded_preds, decoded_labels) if p.strip() == l.strip()
        )
        exact_match_score = exact_matches / len(decoded_preds) if len(decoded_preds) else 0.0

        return {
            "cer": cer,
            "exact_match": exact_match_score
        }

    def train(
        self,
        train_dataset: Dataset,
        val_dataset: Dataset,
        output_dir: str = "./improved-banglish-bengali-model",
        num_train_epochs: int = 5,
        per_device_train_batch_size: int = 8,
        gradient_accumulation_steps: int = 2,
        learning_rate: float = 2e-5,
        warmup_ratio: float = 0.1
    ):
        """Fine-tune the model, keep the checkpoint with the lowest eval CER, and save it.

        Args:
            train_dataset: Dataset used for optimization.
            val_dataset: Dataset used for periodic evaluation and early stopping.
            output_dir: Directory for checkpoints, logs, and the final model.
            num_train_epochs: Number of passes over `train_dataset`.
            per_device_train_batch_size: Training batch size per device
                (evaluation uses twice this value).
            gradient_accumulation_steps: Batches accumulated per optimizer step.
            learning_rate: Peak learning rate.
            warmup_ratio: Fraction of the estimated total steps used for warmup.
        """
        import inspect

        logger.info("Starting training...")

        # Calculate warmup steps (at least one update per epoch so the
        # derived step counts below never become zero)
        effective_batch = per_device_train_batch_size * gradient_accumulation_steps
        num_update_steps_per_epoch = max(1, len(train_dataset) // effective_batch)
        max_steps = num_train_epochs * num_update_steps_per_epoch
        warmup_steps = int(max_steps * warmup_ratio)

        # Evaluate about twice per epoch. `load_best_model_at_end` requires
        # save_steps to be a round multiple of eval_steps, so derive one from
        # the other instead of using the (possibly odd) steps-per-epoch value.
        eval_steps = max(1, num_update_steps_per_epoch // 2)
        save_steps = eval_steps * 2

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever name this installation accepts.
        accepted = inspect.signature(TrainingArguments).parameters
        strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            per_device_eval_batch_size=per_device_train_batch_size * 2,
            eval_steps=eval_steps,
            save_strategy="steps",
            save_steps=save_steps,
            learning_rate=learning_rate,
            warmup_steps=warmup_steps,
            weight_decay=0.01,
            logging_dir=f"{output_dir}/logs",
            logging_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model="eval_cer",
            greater_is_better=False,
            # NOTE(review): T5-family checkpoints are reported to overflow to
            # NaN under fp16 — confirm the loss stays finite, or disable this.
            fp16=torch.cuda.is_available(),
            report_to="tensorboard",
            **{strategy_key: "steps"}
        )

        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            # Decode with this instance's tokenizer instead of depending on a
            # module-level `tokenizer` global.
            compute_metrics=self._compute_metrics,
            # Collapse logits to ids per batch; storing full logits for the
            # whole validation set would need batch x seq x vocab floats.
            preprocess_logits_for_metrics=self._argmax_logits,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Save the final model and tokenizer
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        logger.info(f"Model saved to {output_dir}")

    def transliterate(self, text: str, num_beams: int = 5) -> str:
        """Transliterate one input string using beam search.

        Args:
            text: Raw input; it is cleaned and prefixed the same way as the
                training inputs.
            num_beams: Beam width passed to `generate`.

        Returns:
            The decoded top beam with special tokens removed.
        """
        self.model.eval()

        # Preprocess input
        text = TextPreprocessor.clean_text(text)
        text = "transliterate Bengali: " + text

        # Tokenize
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).to(self.device)

        # Generate with beam search (no gradients needed at inference time)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=self.max_length,
                num_beams=num_beams,
                length_penalty=1.0,
                early_stopping=True,
                no_repeat_ngram_size=2,
                do_sample=False
            )

        # Decode prediction
        predicted_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return predicted_text

In [17]:
# Fetch the transliteration dataset with `datasets.load_dataset`
ds = load_dataset("SKNahin/bengali-transliteration-data")

# Convert every split to pandas and stack them into one frame with a fresh index
combined_df = pd.concat(
    [ds[split_name].to_pandas() for split_name in ds],
    ignore_index=True,
)

# Write the merged frame to a single CSV file (row index omitted)
combined_df.to_csv("data.csv", index=False)

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [None]:
# Training and evaluation
# Initialize the transliterator (loads the default checkpoint, "google/mt5-large")
transliterator = ImprovedBanglishBengaliTransliterator()

# Prepare data with augmentation.
# Read the same relative path the export cell wrote ("data.csv") rather than a
# hardcoded absolute path that only resolves when the working directory is
# /kaggle/working.
train_dataset, val_dataset = transliterator.prepare_data(
    "data.csv",
    apply_augmentation=True
)

# Train the model
transliterator.train(
    train_dataset,
    val_dataset,
    num_train_epochs=5,
    per_device_train_batch_size=8
)