In [1]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import os

class BengaliDataset(Dataset):
    def __init__(self, data, tokenizer, max_length_source=128, max_length_target=64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length_source = max_length_source
        self.max_length_target = max_length_target
        
    def __len__(self):
        return len(self.data)
    
    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        text = text.strip()
        text = ' '.join(text.split())
        return text
    
    def __getitem__(self, idx):
        # Get and clean Banglish and Bangla text
        banglish = self.clean_text(str(self.data[idx]['rm']))
        bangla = self.clean_text(str(self.data[idx]['bn']))
        
        # Tokenize Banglish text
        inputs = self.tokenizer(
            banglish,
            padding='max_length',
            truncation=True,
            max_length=self.max_length_source,
            return_tensors="pt"
        )
        
        # Tokenize Bangla text
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                bangla,
                padding='max_length',
                truncation=True,
                max_length=self.max_length_target,
                return_tensors="pt"
            )
        
        return {
            'input_ids': inputs.input_ids.squeeze(),
            'attention_mask': inputs.attention_mask.squeeze(),
            'labels': labels.input_ids.squeeze()
        }

def train_model(model, train_dataloader, val_dataloader, 
                device='cuda',
                num_epochs=5,
                learning_rate=2e-5,
                warmup_steps=500,
                gradient_accumulation_steps=4,
                max_grad_norm=1.0,
                save_path='./best_model'):
    """Train the model with explicit saving and progress tracking"""
    
    print(f'\nTraining Configuration:')
    print(f'Learning Rate: {learning_rate}')
    print(f'Number of Epochs: {num_epochs}')
    print(f'Warmup Steps: {warmup_steps}')
    print(f'Save Path: {save_path}')
    print('-' * 50)
    
    # Set up optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
    # Set up scheduler
    num_training_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps
    )
    
    # Training loop
    best_val_loss = float('inf')
    model_save_successful = False
    
    for epoch in range(num_epochs):
        print(f'\nStarting Epoch {epoch + 1}/{num_epochs}')
        
        # Training phase
        model.train()
        total_loss = 0
        
        progress_bar = tqdm(train_dataloader, desc=f'Training Epoch {epoch+1}')
        for step, batch in enumerate(progress_bar):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps
            
            # Backward pass
            loss.backward()
            
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            
            total_loss += loss.item() * gradient_accumulation_steps
            progress_bar.set_postfix({'loss': loss.item() * gradient_accumulation_steps})
        
        avg_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch+1} - Average training loss: {avg_loss:.4f}')
        
        # Validation phase
        model.eval()
        val_loss = 0
        print('\nStarting validation...')
        
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc='Validating'):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                val_loss += outputs.loss.item()
        
        avg_val_loss = val_loss / len(val_dataloader)
        print(f'Epoch {epoch+1} - Validation loss: {avg_val_loss:.4f}')
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            try:
                print(f'\nSaving model to {save_path}...')
                os.makedirs(save_path, exist_ok=True)
                model.save_pretrained(save_path)
                print(f'Model successfully saved to {save_path}')
                model_save_successful = True
            except Exception as e:
                print(f'Error saving model: {str(e)}')
                model_save_successful = False
    
    if not model_save_successful:
        print('\nWARNING: Model was not saved successfully during training!')
    
    return model, model_save_successful

def test_translations(model, tokenizer, test_texts, device):
    """Test the model with multiple translations"""
    print('\nStarting translation tests...')
    model.eval()
    results = []
    
    for test_text in test_texts:
        try:
            print(f'\nTranslating: {test_text}')
            inputs = tokenizer(test_text, return_tensors="pt").to(device)
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    forced_bos_token_id=tokenizer.lang_code_to_id["bn_IN"],
                    num_beams=5,
                    max_length=64,
                    early_stopping=True
                )
            
            bangla_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            results.append({
                'input': test_text,
                'output': bangla_text
            })
            print(f'Banglish: {test_text}')
            print(f'Bangla: {bangla_text}')
            print('-' * 50)
            
        except Exception as e:
            print(f'Error translating "{test_text}": {str(e)}')
    
    return results

def main():
    # Set device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Using device: {device}')
    
    # Initialize tokenizer and model
    print('\nLoading tokenizer and model...')
    model_name = "facebook/mbart-large-50-many-to-many-mmt"
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    
    # Set source and target languages
    tokenizer.src_lang = "en_XX"
    tokenizer.tgt_lang = "bn_IN"
    
    # Move model to device
    model.to(device)
    
    # Load and prepare data
    print('\nLoading dataset...')
    dataset = load_dataset("SKNahin/bengali-transliteration-data")
    
    print('\nPreparing data splits...')
    train_test = dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
    train_val = train_test['train'].train_test_split(test_size=0.1, shuffle=True, seed=42)
    
    print('Creating datasets and dataloaders...')
    train_dataset = BengaliDataset(train_val['train'], tokenizer)
    val_dataset = BengaliDataset(train_val['test'], tokenizer)
    
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=8,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=16,
        num_workers=2,
        pin_memory=True
    )
    
    # Train model
    print('\nStarting training process...')
    save_path = './best_model'
    model, save_successful = train_model(
        model,
        train_dataloader,
        val_dataloader,
        device=device,
        num_epochs=10,
        learning_rate=2e-5,
        warmup_steps=500,
        save_path=save_path
    )
    
    if not save_successful:
        print('\nERROR: Model was not saved properly. Please check disk space and permissions.')
        return
    
    # Test translations
    test_texts = [
        "bangla amar matribhasha",
        "ami tomake bhalobashi",
        "kemon acho bondhu"
    ]
    
    results = test_translations(model, tokenizer, test_texts, device)
    
    # Final summary
    print('\nProcess Summary:')
    print(f'Model saved at: {save_path}')
    print(f'Number of test translations completed: {len(results)}')
    print('\nProcess completed!')

if __name__ == "__main__":
    main()

Using device: cuda

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]


Loading dataset...


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]


Preparing data splits...
Creating datasets and dataloaders...

Starting training process...

Training Configuration:
Learning Rate: 2e-05
Number of Epochs: 10
Warmup Steps: 500
Save Path: ./best_model
--------------------------------------------------

Starting Epoch 1/10


Training Epoch 1: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=8.38]


Epoch 1 - Average training loss: 10.0374

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.87it/s]


Epoch 1 - Validation loss: 8.2998

Saving model to ./best_model...


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Model successfully saved to ./best_model

Starting Epoch 2/10


Training Epoch 2: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=3.03]


Epoch 2 - Average training loss: 6.0531

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 2 - Validation loss: 2.9149

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 3/10


Training Epoch 3: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.383]


Epoch 3 - Average training loss: 1.1175

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.87it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 3 - Validation loss: 0.3479

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 4/10


Training Epoch 4: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.0555]


Epoch 4 - Average training loss: 0.2604

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 4 - Validation loss: 0.2254

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 5/10


Training Epoch 5: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.113] 


Epoch 5 - Average training loss: 0.1454

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 5 - Validation loss: 0.1885

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 6/10


Training Epoch 6: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.315] 


Epoch 6 - Average training loss: 0.0919

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 6 - Validation loss: 0.1735

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 7/10


Training Epoch 7: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.0917]


Epoch 7 - Average training loss: 0.0617

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 7 - Validation loss: 0.1643

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 8/10


Training Epoch 8: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.0725] 


Epoch 8 - Average training loss: 0.0453

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.87it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 8 - Validation loss: 0.1582

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 9/10


Training Epoch 9: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.00563]


Epoch 9 - Average training loss: 0.0345

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 9 - Validation loss: 0.1578

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting Epoch 10/10


Training Epoch 10: 100%|██████████| 451/451 [03:19<00:00,  2.26it/s, loss=0.0414] 


Epoch 10 - Average training loss: 0.0273

Starting validation...


Validating: 100%|██████████| 26/26 [00:06<00:00,  3.88it/s]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


Epoch 10 - Validation loss: 0.1531

Saving model to ./best_model...
Model successfully saved to ./best_model

Starting translation tests...

Translating: bangla amar matribhasha
Banglish: bangla amar matribhasha
Bangla: বাংলা আমার মাধ্যমে
--------------------------------------------------

Translating: ami tomake bhalobashi
Banglish: ami tomake bhalobashi
Bangla: আমি তোমাকে ভালোবাসি
--------------------------------------------------

Translating: kemon acho bondhu
Banglish: kemon acho bondhu
Bangla: কেমন এত বন্ধু
--------------------------------------------------

Process Summary:
Model saved at: ./best_model
Number of test translations completed: 3

Process completed!
