# Install Dependencies

Install huggingface datasets library and transformers library.

In [1]:
! pip install datasets transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

# Load The Datasets

Dataset link: https://huggingface.co/datasets/SKNahin/bengali-transliteration-data

In [2]:
from datasets import load_dataset

# load the dataset
ds = load_dataset("SKNahin/bengali-transliteration-data", split='train')
print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Dataset({
    features: ['bn', 'rm'],
    num_rows: 5006
})


# Preprocessing

eg: Remove too long or too short sentences

In [3]:
# Get Minimum & Maximum lengths of the sequences
# min_len, max_len = 0, 0
# for data in ds['rm']:
#     max_len = max(max_len, len(data))
#     min_len = min(min_len, len(data))
# print(min_len, max_len)


# threshold values for truncating dataset
min_len, max_len = 1, 200
excluded_idx = []

for i in range(0, len(ds['rm'])):
    sz = len(ds['rm'][i])
    if sz < min_len or sz > max_len:
        excluded_idx.append(i)

print(f"{excluded_idx = }")

# create new dataset dropping the excluded rows
dataset = ds.select((
    i for i in range(0, len(ds['rm']))
    if i not in excluded_idx
))

print(f"{dataset = }")



excluded_idx = [136, 385, 532, 540]
dataset = Dataset({
    features: ['bn', 'rm'],
    num_rows: 5002
})


# Do Train Test Split

In [4]:
# split the dataset into train and test
split_data = dataset.train_test_split(test_size=0.2, seed=42)
train_data = split_data['train']
test_data = split_data['test']

# print train-test data metadata
print(f"{train_data = }\n\n{test_data = }")

Filter:   0%|          | 0/4001 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1001 [00:00<?, ? examples/s]

train_data = Dataset({
    features: ['bn', 'rm'],
    num_rows: 4001
})

test_data = Dataset({
    features: ['bn', 'rm'],
    num_rows: 1001
})


# Import Libraries for Training the Model

In [5]:
import torch
from transformers import (
    MBart50Tokenizer,
    MBartForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

# Model Selection and Loading

We have used mBART model for our translation task.

In [6]:
model_name = "facebook/mbart-large-50"

In [7]:
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [8]:
# define language codes
source_lang = "en_XX"  # BANGLISH
target_lang = "bn_BN"  # PURE BENGALI

# ensure the tokenizer supports the target language
if target_lang not in tokenizer.lang_code_to_id:
    tokenizer.lang_code_to_id[target_lang] = len(tokenizer.lang_code_to_id)
    tokenizer.id_to_lang_code[len(tokenizer.id_to_lang_code)] = target_lang

# function to tokenize the dataset
def preprocess_function(example_batch):
    inputs = [f"<{source_lang}> {text}" for text in example_batch["rm"]]  # add source language token
    targets = [f"<{target_lang}> {text}" for text in example_batch["bn"]]  # add target language token

    # input_encodings = tokenizer(example_batch["rm"], max_length=1024, padding="max_length",truncation=True)
    # target_encodings = tokenizer(example_batch["bn"], max_length=1024, padding="max_length", truncation=True)

    input_encodings = tokenizer(
        inputs,
        max_length=1024,
        padding="max_length",
        truncation=True
    )

    target_encodings = tokenizer(
        targets,
        max_length=1024,
        padding="max_length",
        truncation=True
    )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }


In [9]:
# tokenize train and test datasets
train_data = train_data.map(preprocess_function, batched=True)
test_data = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/4001 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
# training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=2,
    # num_train_epochs=1,
    max_steps=1000,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    report_to="none",
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.0379,0.028246




TrainOutput(global_step=1000, training_loss=0.8452933893203736, metrics={'train_runtime': 1033.7016, 'train_samples_per_second': 0.967, 'train_steps_per_second': 0.967, 'total_flos': 2167129767936000.0, 'train_loss': 0.8452933893203736, 'epoch': 0.24993751562109473})

In [14]:
trainer.save_model("./banglish-to-bengali")
tokenizer.save_pretrained("./banglish-to-bengali")

('./banglish-to-bengali/tokenizer_config.json',
 './banglish-to-bengali/special_tokens_map.json',
 './banglish-to-bengali/sentencepiece.bpe.model',
 './banglish-to-bengali/added_tokens.json')

# Evaluation

In [15]:
results = trainer.evaluate()
print("Evaluation Results: ", results)

KeyboardInterrupt: 

# Generate Output from User Input

In [19]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer

model_path = "./results/checkpoint-1000"
model = MBartForConditionalGeneration.from_pretrained(model_path)
tokenizer = MBart50Tokenizer.from_pretrained(model_path)

In [22]:
def translate_banglish_to_bengali(prompt: str) -> str:
    input_text = f"<en_XX> {prompt}"    # add language tag

    input_ids = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).input_ids

    # generate translation
    outputs = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)

    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text.replace('<bn_BN>', '').strip()

In [25]:
prompts = [
    'vai eita kono kotha?',
    'tui shala ek nambar er kepta',
    'pasa thakla pasa paban',
    'ami vaat khabo vai ekhon'
]

for user_prompt in prompts:
    translated_output = translate_banglish_to_bengali(user_prompt)

    print("User Input (Banglish):", user_prompt)
    print("Translated Output (Ashol Bangla):", translated_output)

User Input (Banglish): vai eita kono kotha?
Translated Output (Ashol Bangla): ভাই এইটা কোনো কথা?
User Input (Banglish): tui shala ek nambar er kepta
Translated Output (Ashol Bangla): tui হালরে একটা নাম্বার এর জন্য
User Input (Banglish): pasa thakla pasa paban
Translated Output (Ashol Bangla): কিছু বলা যায় কিছু পাচ্ছে
User Input (Banglish): ami vaat khabo vai ekhon
Translated Output (Ashol Bangla): আমি হইবো ভাই আছে


In [26]:
! pip install huggingface_hub



In [31]:
# !huggingface-cli login
# !transformers-cli login

2024-12-21 15:45:57.906671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-21 15:45:57.926029: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-21 15:45:57.931809: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[1m[31mERROR! `huggingface-cli login` uses an outdated login mechanism that is not compatible with the Hugging Face Hub backend anymore. Please use `huggingface-cli login instead.[0m
