# Text Translation

In [17]:
#!pip install transformers datasets evaluate sacrebleu sacremoses sentencepiece

In [1]:
import numpy as np
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer,
                          DataCollatorForSeq2Seq)
from transformers import pipeline
import evaluate
metric = evaluate.load("sacrebleu")

### Example of Text Translation

In [3]:
text = "Hello, how are you?"

translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")

output = translator(text)[0]['translation_text']

print("\nTranslation Output:")
print(output)

Device set to use cuda:0



Translation Output:
Hola, ¿cómo estás?


### Full Text Translation Workflow

1- Dataset Preparation

In [5]:
dataset = load_dataset("opus100", "en-es", split={"train": "train[:5%]", "test": "test[:5%]"})
print(dataset['train'][0])

{'translation': {'en': "It was the asbestos in here, that's what did it!", 'es': 'Fueron los asbestos aquí. ¡Eso es lo que ocurrió!'}}


2- Tokenizer Initialization

In [6]:
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(model_name)

3- Data Preprocessing

In [7]:
def preprocess_function(examples):
    source_texts = [ex["en"] for ex in examples["translation"]]
    target_texts = [ex["es"] for ex in examples["translation"]]

    inputs = tokenizer(source_texts, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length")

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

4- Model Loading

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

5- Data Collation & Training Configuration

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

6- Evaluation Metrics

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    if isinstance(predictions, tuple):
        predictions = predictions[0]  
    
    predictions = np.argmax(predictions, axis=-1) 

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    references = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=references)

    return {"bleu": result["score"]}


7- Model Training & Evaluation

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,0.1229,0.326156,30.307175
2,0.1453,0.310714,32.074927


TrainOutput(global_step=12500, training_loss=0.12212513061523438, metrics={'train_runtime': 3502.9131, 'train_samples_per_second': 28.548, 'train_steps_per_second': 3.568, 'total_flos': 1.35593459712e+16, 'train_loss': 0.12212513061523438, 'epoch': 2.0})

8- Model Saving & Inference

In [16]:
trainer.save_model("./fine_tuned_tr")
tokenizer.save_pretrained("./fine_tuned_tr")

translator = pipeline("translation_en_to_es", model="./fine_tuned_tr")

text = "Hello, how are you?"
translated_text = translator(text)[0]['translation_text']
print(f"Translated Text: {translated_text}")

Device set to use cuda:0


Translated Text: Hola, ¿cómo estás?
