In [None]:
pip install transformers[torch]

In [None]:
pip install accelerate -U

In [3]:
import transformers
import accelerate

# Report the installed library versions so the run can be reproduced later.
for label, module in (("Transformers", transformers), ("Accelerate", accelerate)):
    print(f"{label} version:", module.__version__)

Transformers version: 4.40.1
Accelerate version: 0.30.0


In [4]:
from transformers import Trainer

In [2]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset,Dataset
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split is identical after
# "Restart Kernel -> Run All".
SPLIT_SEED = 42

# Check if a GPU is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the dataset
dataset = load_dataset("SouthernCrossAI/Eng2German")
full_dataset = dataset['train']  # Assuming all data is in the 'train' split initially

# Hold out 10% for validation with the Dataset's own splitter. Unlike
# sklearn's train_test_split, this keeps the data Arrow-backed instead of
# materialising every row as Python lists and rebuilding it with from_dict.
splits = full_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = splits["train"]
val_dataset = splits["test"]

tokenizer = T5Tokenizer.from_pretrained("t5-base")

def preprocess_function(examples):
    """Tokenize English/German pairs into model inputs and labels for T5.

    Args:
        examples: Mapping with 'English' and 'German' entries. Each entry is a
            list of strings when called via ``Dataset.map(..., batched=True)``,
            or a single string for an unbatched call.

    Returns:
        The encoding produced by the module-level ``tokenizer`` for the
        prefixed English text, with the German token ids stored under
        'labels'. Both sides are padded/truncated to 128 tokens, and padded
        label positions are replaced by -100.
    """
    # Same task prefix that translate() prepends at inference time, so the
    # model is fine-tuned on the input format it will later be queried with.
    task_prefix = "translate English to German: "
    max_length = 128
    # -100 is the default ignore_index of PyTorch's cross-entropy loss; using
    # it for padding keeps pad positions out of the training loss.
    ignore_index = -100

    sources = examples['English']
    targets = examples['German']
    is_single = isinstance(sources, str)
    if is_single:
        sources = task_prefix + sources
    else:
        sources = [task_prefix + sentence for sentence in sources]

    # text_target replaces the deprecated as_target_tokenizer() context
    # manager: one call encodes the inputs and stores the target ids as 'labels'.
    model_inputs = tokenizer(
        sources,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        return [tok if tok != pad_id else ignore_index for tok in token_ids]

    labels = model_inputs['labels']
    model_inputs['labels'] = (
        _mask_padding(labels) if is_single else [_mask_padding(seq) for seq in labels]
    )
    return model_inputs

# Tokenize both splits in batches; the same preprocessing runs on each,
# training split first and validation split second.
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/199379 [00:00<?, ? examples/s]



Map:   0%|          | 0/22154 [00:00<?, ? examples/s]

In [3]:
# Instantiate the pretrained T5 checkpoint and place it on the device chosen earlier.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Fine-tuning hyperparameters, gathered in one mapping so they are easy to scan:
# evaluate once per epoch, keep at most three checkpoints, never push to the Hub.
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "weight_decay": 0.01,
    "save_total_limit": 3,
    "num_train_epochs": 20,
    "push_to_hub": False,
}
training_args = TrainingArguments(**training_config)

# Wire the model, arguments, tokenized splits and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

# Write the model weights and the tokenizer files into one directory so the
# pair can be reloaded together with from_pretrained().
export_dir = "./t5_eng2de_translated"
model.save_pretrained(export_dir)
trainer.tokenizer.save_pretrained(export_dir)

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
# Loaded (model, tokenizer) pairs keyed by directory, so repeated translate()
# calls do not re-read the checkpoint from disk. Call _TRANSLATOR_CACHE.clear()
# after re-saving a model to the same directory within one kernel session.
_TRANSLATOR_CACHE = {}


def _load_translator(model_dir):
    """Return the (model, tokenizer) pair for model_dir, loading it on first use."""
    if model_dir not in _TRANSLATOR_CACHE:
        model = T5ForConditionalGeneration.from_pretrained(model_dir)
        tokenizer = T5Tokenizer.from_pretrained(model_dir)
        _TRANSLATOR_CACHE[model_dir] = (model, tokenizer)
    return _TRANSLATOR_CACHE[model_dir]


def translate(text, max_new_tokens=128, model_dir="./t5_eng2de_translated"):
    """Translate English text to German with the fine-tuned T5 checkpoint.

    Args:
        text: English text to translate.
        max_new_tokens: Upper bound on generated tokens. Passed explicitly
            because generate() otherwise falls back to a short default length
            that cuts longer translations off.
        model_dir: Directory holding the saved model and tokenizer.

    Returns:
        The decoded translation with special tokens removed, or an empty
        string when text is empty or only whitespace.
    """
    # Nothing to translate: skip loading and generation entirely.
    if not text or not text.strip():
        return ""

    model, tokenizer = _load_translator(model_dir)

    # Encode the text with the T5 task prefix
    input_ids = tokenizer.encode("translate English to German: " + text, return_tensors="pt")

    # Generate the translation
    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # Decode the translated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    """Run an interactive English-to-German prompt loop.

    Prints a two-line banner, then repeatedly reads a line of input and prints
    the result of translate() for it. Entering 'exit' (in any letter case)
    prints a farewell message and ends the loop.
    """
    for banner_line in (
        "English to German Translation",
        "Type 'exit' to quit the translation tool.",
    ):
        print(banner_line)

    # Keep prompting until the user types the exit keyword.
    while (user_text := input("Enter text to translate: ")).lower() != 'exit':
        print(f"Translated Text: {translate(user_text)}")

    print("Exiting the translation tool.")

# Call the main function to start the translation tool.
# In a notebook kernel, cell code runs with __name__ == "__main__", so
# executing this cell immediately starts the blocking interactive prompt loop.
if __name__ == "__main__":
    main()