In [1]:
!pip install transformers datasets sacrebleu nltk torch sentencepiece tqdm

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting color

In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from nltk.tokenize import word_tokenize
import nltk
import logging
import os
from google.colab import files
from google.colab import drive

nltk.download('punkt', quiet=True)
logging.basicConfig(level=logging.INFO)

# Mount Google Drive
drive.mount('/content/drive')

def load_models_and_tokenizers():
    zh_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
    zh_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    zh_en_model.to(device)

    return zh_en_model, zh_en_tokenizer, device

def preprocess_function(examples, tokenizer, src_lang, tgt_lang):
    src_processed = [" ".join(text.split()) for text in examples[src_lang]]
    tgt_processed = [" ".join(word_tokenize(text.lower())) for text in examples[tgt_lang]]

    model_inputs = tokenizer(src_processed, max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(tgt_processed, max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def train_model(model, tokenizer, train_dataset, val_dataset, output_dir, src_lang, tgt_lang, num_epochs=30):
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_steps=500,
        save_total_limit=2,
        learning_rate=5e-5,
        fp16=True,
        gradient_accumulation_steps=2,
        evaluation_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

    train_dataset = train_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer, src_lang, tgt_lang),
        batched=True,
        remove_columns=train_dataset.column_names,
        batch_size=100
    )

    val_dataset = val_dataset.map(
        lambda examples: preprocess_function(examples, tokenizer, src_lang, tgt_lang),
        batched=True,
        remove_columns=val_dataset.column_names,
        batch_size=100
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(output_dir)
    return trainer.model, trainer.tokenizer

def translate(text, model, tokenizer, device, max_length=64):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

def main():
    print("Chinese to English Translation Model Training")

    zh_en_model, zh_en_tokenizer, device = load_models_and_tokenizers()

    print("Loading and preparing dataset")
    full_dataset = load_dataset("Aye10032/zh-en-translate-20k", split="train")

    train_size = int(0.9 * len(full_dataset))
    val_size = len(full_dataset) - train_size

    train_dataset = full_dataset.select(range(train_size))
    val_dataset = full_dataset.select(range(train_size, len(full_dataset)))

    print("Train dataset size:", len(train_dataset))
    print("Validation dataset size:", len(val_dataset))

    print("Training Chinese to English model...")
    zh_en_output_dir = "/content/drive/MyDrive/trained_zh_en_model"
    trained_zh_en_model, trained_zh_en_tokenizer = train_model(zh_en_model, zh_en_tokenizer, train_dataset, val_dataset, zh_en_output_dir, "chinese", "english", num_epochs=30)

    print("Model training complete!")

    # Save the model and tokenizer
    print("Saving model and tokenizer...")
    trained_zh_en_model.save_pretrained(zh_en_output_dir, safe_serialization=True)
    trained_zh_en_tokenizer.save_pretrained(zh_en_output_dir)

    print("Model and tokenizer saved to Google Drive.")
    print("You can now access the files in your Google Drive under the 'trained_zh_en_model' folder.")

    print("\nTranslation example:")
    chinese_text = "你好世界"

    print(f"Chinese to English:")
    print(f"Input: {chinese_text}")
    print(f"Output: {translate(chinese_text, trained_zh_en_model, trained_zh_en_tokenizer, device)}")

if __name__ == "__main__":
    main()

Mounted at /content/drive
Chinese to English Translation Model Training


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



Loading and preparing dataset


Downloading readme:   0%|          | 0.00/455 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/193k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20127 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2237 [00:00<?, ? examples/s]

Train dataset size: 18114
Validation dataset size: 2013
Training Chinese to English model...




Map:   0%|          | 0/18114 [00:00<?, ? examples/s]

Map:   0%|          | 0/2013 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
500,0.9769,0.852113
1000,0.7086,0.65544
1500,0.5551,0.582176
2000,0.45,0.543439
2500,0.3869,0.525608
3000,0.3233,0.509275
3500,0.2718,0.504122
4000,0.257,0.502389
4500,0.2438,0.49504
5000,0.2074,0.500455


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


Model training complete!
Saving model and tokenizer...
Model and tokenizer saved to Google Drive.
You can now access the files in your Google Drive under the 'trained_zh_en_model' folder.

Translation example:
Chinese to English:
Input: 你好世界
Output: how life you live in the world.
