In [1]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import logging

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Logging: obtain this notebook's named logger, then configure the root logger
# so that INFO-level records are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
class BengaliDataset(Dataset):
    """Map-style dataset pairing Bengali text with its romanized transliteration.

    Each record of ``data`` is read through its ``'bn'`` (source) and ``'rm'``
    (target) entries, tokenized on access, and returned as fixed-length tensors.

    Args:
        data: Indexable collection supporting ``len()``; every record must
            expose ``'bn'`` and ``'rm'`` entries.
        tokenizer: Tokenizer callable accepting ``text`` / ``text_target`` and
            exposing ``pad_token_id``.
        max_source_length: Length the Bengali input is padded/truncated to.
        max_target_length: Length the romanized labels are padded/truncated to.
    """

    # Label id that the cross-entropy loss skips; used to blank out padding.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_source_length=128, max_target_length=64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of records in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (length
            ``max_source_length``) and ``labels`` (length ``max_target_length``)
            where padding positions are set to ``IGNORE_INDEX``.
        """
        # Fetch the record once; indexing the backing store may be non-trivial.
        record = self.data[idx]
        bengali = str(record['bn'])
        roman = str(record['rm'])

        # Tokenize Bengali text (source side)
        inputs = self.tokenizer(
            bengali,
            padding='max_length',
            truncation=True,
            max_length=self.max_source_length,
            return_tensors="pt",
        )

        # Tokenize romanized text (target side). ``text_target`` replaces the
        # deprecated ``as_target_tokenizer()`` context manager.
        labels = self.tokenizer(
            text_target=roman,
            padding='max_length',
            truncation=True,
            max_length=self.max_target_length,
            return_tensors="pt",
        )

        # Drop only the leading batch axis added by return_tensors="pt".
        label_ids = labels.input_ids.squeeze(0)

        # Without this mask the model is trained to predict padding tokens,
        # which dominates the loss for short targets.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            label_ids = label_ids.masked_fill(label_ids == pad_id, self.IGNORE_INDEX)

        return {
            'input_ids': inputs.input_ids.squeeze(0),
            'attention_mask': inputs.attention_mask.squeeze(0),
            'labels': label_ids,
        }

In [None]:
def train_model(model, train_dataloader, val_dataloader, num_epochs=3, device='cuda',
                learning_rate=2e-5):
    """Fine-tune ``model`` with AdamW and log per-epoch train/validation loss.

    Args:
        model: Module whose forward accepts the batch dict as keyword arguments
            and returns an object with a ``loss`` attribute.
        train_dataloader: Iterable of dict batches (name -> tensor) for training.
        val_dataloader: Iterable of dict batches for validation.
        num_epochs: Number of full passes over ``train_dataloader``.
        device: Device the model and every batch are moved to.
        learning_rate: AdamW learning rate (defaults to the previous fixed 2e-5).

    Returns:
        The same ``model`` object, updated in place.
    """
    # Move model to device
    model.to(device)

    # Set up optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        # ---- Training ----
        model.train()
        total_loss = 0.0
        train_batches = 0

        for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch+1}'):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass
            loss = model(**batch).loss

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            train_batches += 1

        # Average over batches actually seen: avoids ZeroDivisionError on an
        # empty loader and works for loaders that do not implement __len__.
        avg_loss = total_loss / train_batches if train_batches else float('nan')
        logger.info(f'Epoch {epoch+1} - Average training loss: {avg_loss:.4f}')

        # ---- Validation ----
        model.eval()
        val_loss = 0.0
        val_batches = 0
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc='Validating'):
                batch = {k: v.to(device) for k, v in batch.items()}
                val_loss += model(**batch).loss.item()
                val_batches += 1

        avg_val_loss = val_loss / val_batches if val_batches else float('nan')
        logger.info(f'Epoch {epoch+1} - Validation loss: {avg_val_loss:.4f}')

    return model

In [None]:
# Select the compute device: use CUDA when PyTorch reports it, otherwise CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
logger.info(f'Using device: {device}')

In [None]:
# Fetch the Bengali transliteration corpus through the `datasets` loader
logger.info('Loading dataset...')
dataset = load_dataset(
    path="SKNahin/bengali-transliteration-data",
)

In [None]:
# Split dataset: hold out 20% for test, then 10% of the remainder for validation.
# Both splits shuffle, so a fixed seed is required for the partition (and hence
# the reported validation/test numbers) to be reproducible across runs.
SPLIT_SEED = 42

train_test = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_val = train_test['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)

train_data = train_val['train']
val_data = train_val['test']
test_data = train_test['test']


In [None]:
# Load the pretrained checkpoint: tokenizer first (with its language codes), then the model
logger.info('Initializing tokenizer and model...')
model_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
# Source text is tagged "bn_IN"; target text is tagged "en_XX"
tokenizer.src_lang, tokenizer.tgt_lang = "bn_IN", "en_XX"

model = MBartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Training pipeline: wrap the split, then batch it with shuffling
train_dataset = BengaliDataset(train_data, tokenizer)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

# Validation pipeline: same wrapping, batches served in order
val_dataset = BengaliDataset(val_data, tokenizer)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [None]:
# Run fine-tuning; train_model returns the same model object it was given
logger.info('Starting training...')
model = train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    num_epochs=3,
    device=device,
)

In [None]:
# Persist the model and then the tokenizer into the same output directory
logger.info('Saving model...')
for _artifact in (model, tokenizer):
    _artifact.save_pretrained('./bengali_transliteration_model')

In [None]:
# Smoke-test setup: choose a sample Bengali word and switch the model to eval mode
logger.info('Testing transliteration...')
test_text = "বাংলা"
model.eval()

In [None]:
# Encode the sample and place the tensors on the same device as the model
inputs = tokenizer(test_text, return_tensors="pt")
inputs = inputs.to(device)

# Force decoding to begin with the "en_XX" language token
translated_tokens = model.generate(
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    **inputs,
)

# Only one sequence was generated, so decode the first row directly
translation = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
logger.info(f'Bengali: {test_text}')
logger.info(f'Transliteration: {translation}')