In [1]:
import numpy as np
import matplotlib.pyplot as plt


In [9]:
pip install datasets



In [10]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split the dataset into training and validation subsets
train_valid_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_valid_split["train"]
valid_dataset = train_valid_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(valid_dataset)}")

Training samples: 4004
Validation samples: 1002


In [11]:
pip install transformers



In [16]:
from transformers import AutoTokenizer

# Choose a tokenizer based on the model you select later
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

# Preprocessing function
def preprocess_data(batch):
    inputs = tokenizer(batch["rm"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(batch["bn"], truncation=True, padding="max_length", max_length=128)
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]
    }

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_data, batched=True)
valid_dataset = valid_dataset.map(preprocess_data, batched=True)


tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Map:   0%|          | 0/4004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [20]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Load the pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    seed=42,

)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,3.2074,1.280978
2,0.5869,0.491976
3,0.5495,0.455454
4,0.5158,0.442038
5,0.5155,0.438129


TrainOutput(global_step=1255, training_loss=2.4372400504184433, metrics={'train_runtime': 2028.1774, 'train_samples_per_second': 9.871, 'train_steps_per_second': 0.619, 'total_flos': 4598341633966080.0, 'train_loss': 2.4372400504184433, 'epoch': 5.0})

In [22]:
def generate_predictions(batch):
    inputs = tokenizer(batch["rm"], return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # Generate predictions
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=128)

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {"predictions": decoded_preds}

# Generate predictions for the test dataset
test_results = valid_dataset.map(generate_predictions, batched=True, batch_size=16)

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [24]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [26]:
import evaluate

# Load the BLEU metric
bleu = evaluate.load("bleu")

# Prepare references and predictions
references = [[ref] for ref in test_results["bn"]]
predictions = test_results["predictions"]

# Compute BLEU score
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"BLEU Score: {bleu_score['bleu']:.4f}")


BLEU Score: 0.0069
