In [3]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7



In [4]:
from datasets import load_dataset

dataset = load_dataset("SKNahin/bengali-transliteration-data")

print(dataset['train'].column_names)


['bn', 'rm']


In [6]:
train_test_split = dataset['train'].train_test_split(test_size=0.1)
train_dataset = dataset['train'].select(range(1000))
val_dataset = dataset['train'].select(range(1000, 1200))


print(f"Training examples: {len(train_dataset)}, Validation examples: {len(val_dataset)}")


Training examples: 1000, Validation examples: 1000


In [7]:
from transformers import MBart50Tokenizer

tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")

tokenizer.src_lang = "en_XX"  
tokenizer.tgt_lang = "bn_IN"  


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



In [8]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples['rm'], max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['bn'], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

print(train_dataset[0]) 


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'bn': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'rm': 'scroll kore 20/30 second er video pann nai???', 'input_ids': [250004, 192046, 20867, 387, 108355, 17932, 72, 1202, 2652, 19, 24, 14, 7273, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [250028, 6, 67734, 174451, 2763, 5507, 32427, 64, 152404, 22540, 2937, 36358, 11785, 38727, 201153, 116930, 7273, 2]}


In [9]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples['rm'], max_length=128, padding='max_length', truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['bn'], max_length=128, padding='max_length', truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

print(train_dataset[0])  


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'bn': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'rm': 'scroll kore 20/30 second er video pann nai???', 'input_ids': [250004, 192046, 20867, 387, 108355, 17932, 72, 1202, 2652, 19, 24, 14, 7273, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [250028, 6, 67734, 174451, 2763, 5507,

In [10]:
from transformers import MBartForConditionalGeneration

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [11]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          
    evaluation_strategy="epoch",     
    learning_rate=5e-5,              
    per_device_train_batch_size=2,   
    per_device_eval_batch_size=2,     
    weight_decay=0.001,              
    save_total_limit=3,             
    num_train_epochs=10,             
    predict_with_generate=True,      
    logging_dir="./logs",            
    logging_steps=50,                
    do_train=True,                   
    do_eval=True                     
)




In [12]:

from transformers import Seq2SeqTrainer

# Define trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [None]:
from datasets import load_metric

metric = load_metric("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return {"bleu": metric.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])}

# Add compute_metrics to trainer
trainer.compute_metrics = compute_metrics
trainer.evaluate()


In [None]:
model.save_pretrained("./banglish-to-bangla-model")
tokenizer.save_pretrained("./banglish-to-bangla-tokenizer")


In [None]:
from transformers import pipeline

# Load pipeline
translator = pipeline("translation", model="./banglish-to-bangla-model", tokenizer="./banglish-to-bangla-tokenizer")

# Translate example Banglish text
output = translator("ami test korechi")
print(output)
