In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm

In [3]:
# Task 1: Load the Dataset
print("Loading dataset...")
dataset = load_dataset("SKNahin/bengali-transliteration-data")

if "train" in dataset:
    print("Splitting dataset into training and validation...")
    dataset = dataset["train"].train_test_split(test_size=0.2)
else:
    raise ValueError("The dataset does not have a 'train' split.")

train_data = dataset["train"]
val_data = dataset["test"]

print("Dataset successfully loaded and split.")
print("Training data size:", len(train_data))
print("Validation data size:", len(val_data))


Loading dataset...
Splitting dataset into training and validation...
Dataset successfully loaded and split.
Training data size: 4004
Validation data size: 1002


In [4]:
# Task 2: Data Preprocessing
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The facebook/nllb-200-distilled-600M model and its tokenizer were used because the model is pre-trained for multilingual tasks, 
# and the tokenizer can handle multiple languages, including Bengali. The distilled version was used to reduce the model size and
# make training faster. 
# Our experiment showed that the tokenizer can handle Bengali Sentence in Latin script.

In [5]:
# Define the length thresholds for filtering (e.g., 5 to 128 tokens)
min_length = 5
max_length = 128

# Filter function to remove sentences that are too short or too long
def filter_lengths(batch):
    # Filter the sentences in 'rm' and 'bn' columns
    filtered_rm_bn = [
        (rm_text, bn_text) for rm_text, bn_text in zip(batch["rm"], batch["bn"])
        if min_length <= len(rm_text.split()) <= max_length and min_length <= len(bn_text.split()) <= max_length
    ]

    # Unzip the filtered pairs
    filtered_rm, filtered_bn = zip(*filtered_rm_bn) if filtered_rm_bn else ([], [])

    # Return the filtered batch
    return {
        "rm": list(filtered_rm),
        "bn": list(filtered_bn)
    }

# Apply the filtering function
print("Filtering dataset...")
train_data = train_data.map(filter_lengths, batched=True)
val_data = val_data.map(filter_lengths, batched=True)

# Remove examples with no valid entries (if the filtering process resulted in empty lists)
train_data = train_data.filter(lambda x: len(x["rm"]) > 0 and len(x["bn"]) > 0)
val_data = val_data.filter(lambda x: len(x["rm"]) > 0 and len(x["bn"]) > 0)

print(f"Filtered training data size: {len(train_data)}")
print(f"Filtered validation data size: {len(val_data)}")

Filtering dataset...


Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2643 [00:00<?, ? examples/s]

Filter:   0%|          | 0/674 [00:00<?, ? examples/s]

Filtered training data size: 2643
Filtered validation data size: 674


In [6]:
# Tokenization function (modified)
def tokenize_batch(batch):
    inputs = tokenizer(
        batch["rm"],
        max_length=128,  # Adjust max_length based on your dataset
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    targets = tokenizer(
        batch["bn"],
        max_length=128,  # Ensure same max_length as inputs
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    # Align labels with target input_ids
    inputs["labels"] = targets["input_ids"].tolist()  # Convert to list

    # Return only the necessary components
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": inputs["labels"]
    }

print("Tokenizing data...")
tokenized_train = train_data.map(tokenize_batch, batched=True, remove_columns=train_data.column_names)
tokenized_val = val_data.map(tokenize_batch, batched=True, remove_columns=val_data.column_names)

Tokenizing data...


Map:   0%|          | 0/2643 [00:00<?, ? examples/s]

Map:   0%|          | 0/674 [00:00<?, ? examples/s]

In [7]:
print(tokenized_train[0])

{'input_ids': [256047, 4575, 110, 330, 32631, 133, 925, 208033, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [256047, 211598, 214271, 89599, 16203, 11831, 248059, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [8]:
# Task 3: Select a Model
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

Loading model...


In [9]:
train_dataloader = DataLoader(
    tokenized_train, shuffle=True, batch_size=8, collate_fn=lambda x: tokenizer.pad(x, return_tensors="pt")
)
val_dataloader = DataLoader(
    tokenized_val, shuffle=False, batch_size=8, collate_fn=lambda x: tokenizer.pad(x, return_tensors="pt")
)

In [10]:
print("Sample tokenized train data:")
print(tokenized_train[0])
print("Lengths of input_ids and labels:")
print(len(tokenized_train[0]["input_ids"]))
print(len(tokenized_train[0]["labels"]))


Sample tokenized train data:
{'input_ids': [256047, 4575, 110, 330, 32631, 133, 925, 208033, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [256047, 211598, 214271, 89599, 16203, 11831, 248059, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [11]:
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
gradient_accumulation_steps = 1
max_grad_norm = 1.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            eval_loss += outputs.loss.item()

    avg_eval_loss = eval_loss / len(val_dataloader)
    print(f"Validation loss: {avg_eval_loss}")

print("Training completed.")


Epoch 1/3


Training:   0%|          | 0/331 [00:00<?, ?it/s]You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Training: 100%|██████████| 331/331 [07:42<00:00,  1.40s/it]


Average training loss: 6.400282641552006


Evaluating: 100%|██████████| 85/85 [00:32<00:00,  2.59it/s]


Validation loss: 3.2001216046950396
Epoch 2/3


Training: 100%|██████████| 331/331 [07:40<00:00,  1.39s/it]


Average training loss: 1.4743137441553018


Evaluating: 100%|██████████| 85/85 [00:32<00:00,  2.59it/s]


Validation loss: 0.1923657254261129
Epoch 3/3


Training: 100%|██████████| 331/331 [07:40<00:00,  1.39s/it]


Average training loss: 0.10893020239796523


Evaluating: 100%|██████████| 85/85 [00:33<00:00,  2.57it/s]

Validation loss: 0.07537717453258878
Training completed.





In [12]:
pip install transformers datasets huggingface_hub



In [13]:
# Save model and tokenizer to a local directory
model_save_path = "./my_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)



('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/sentencepiece.bpe.model',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Path to the saved model and tokenizer directory
model_path = "./my_model"  # Replace this with your actual path

# Load the model and tokenizer from the saved directory
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Set the device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the chosen device
model = model.to(device)

# Define the input text
input_text = "Ami tomar sathe kotha bolte chai"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Move inputs to the same device as the model (GPU or CPU)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Perform inference to generate the translation
with torch.no_grad():
    output = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

# Decode the generated tokens to get the translated text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the translation
print("Input Text:", input_text)
print("Translated Text:", output_text)


Input Text: Ami tomar sathe kotha bolte chai
Translated Text: আমি তোমার সাথে কথা বলতে চাই 


In [15]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from huggingface_hub import upload_file

# Define the model directory and repository name
model_repo_name = "saugataroy/Willow_bn_lt_2_bn"

# Upload the model and tokenizer files
model.push_to_hub(model_repo_name)
tokenizer.push_to_hub(model_repo_name)


model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/32.2M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saugataroy/Willow_bn_lt_2_bn/commit/b29593089344afa6901e804ccb7b6f6285ddfffc', commit_message='Upload tokenizer', commit_description='', oid='b29593089344afa6901e804ccb7b6f6285ddfffc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saugataroy/Willow_bn_lt_2_bn', endpoint='https://huggingface.co', repo_type='model', repo_id='saugataroy/Willow_bn_lt_2_bn'), pr_revision=None, pr_num=None)