In [None]:
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
!pip install numpy
!pip install transformers
!pip install sacrebleu
!pip install accelerate
!pip install tqdm
!pip install huggingface_hub

!git config --global user.email "XXXX
!git config --global user.name "XXXX"
!git config --global credential.helper store

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

# XXXXXX

# Set the global git identity and store credentials on disk.
!git config --global user.email "XXXX"
!git config --global user.name "XXXXX"
!git config --global credential.helper store

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
from transformers import M2M100Tokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric, Dataset
from huggingface_hub import notebook_login
import numpy as np
import torch

# Load the KDE4 dataset for the English/French language pair.
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")

# Split the "train" data 90/10 with a fixed seed, then expose the held-out
# part under the key "validation" instead of "test".
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")

# SacreBLEU metric object; used by compute_metrics below.
metric = load_metric("sacrebleu")

# Tokenizer for the M2M100 checkpoint, with English as the source language
# and French as the target language.
model_checkpoint = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_checkpoint)
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "fr"

# Values passed as max_length (with truncation=True) when tokenizing the
# inputs and the targets in preprocess_function.
max_input_length = 128
max_target_length = 128

def preprocess_function(examples, max_examples=999):
    """Tokenize one batch of English/French translation pairs.

    Args:
        examples: Batch whose "translation" entry is a list of dicts with
            "en" and "fr" keys.
        max_examples: Maximum number of pairs kept from the batch; pairs at
            later positions are dropped. The default of 999 reproduces the
            intended cut-off of the previous code, which discarded items from
            the 1000th onward. Pass None to keep every pair.

    Returns:
        The tokenizer output for the English sentences, with the token ids of
        the French sentences stored under the "labels" key.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]

    if max_examples is not None:
        # Cut both lists at the same position so every source sentence keeps
        # its own translation. The earlier version called list.remove(value)
        # while iterating, which deletes the first *equal* string (not the
        # current position) and skips items, so duplicate sentences could
        # leave inputs and targets misaligned.
        inputs = inputs[:max_examples]
        targets = targets[:max_examples]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: A (predictions, labels) pair. When predictions is a
            tuple, only its first element is used.

    Returns:
        A dict with a single "bleu" key holding the metric's "score" value.
    """
    predictions, label_ids = eval_preds
    # The model may return more than the predictions themselves; keep the first item.
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    # -100 entries cannot be decoded, so substitute the pad token id for them.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    # Decode both sides and trim surrounding whitespace; each reference is
    # wrapped in its own single-element list.
    hypotheses = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    references = [
        [text.strip()]
        for text in tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    ]

    bleu = metric.compute(predictions=hypotheses, references=references)
    return {"bleu": bleu["score"]}


# Tokenize every split in batches. The original columns are removed so that
# only the fields returned by preprocess_function remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

# The untokenized datasets are not referenced again below.
del raw_datasets
del split_datasets

# Load the seq2seq model from the same checkpoint as the tokenizer and build
# the collator from both.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Collate the training examples at indices 1 and 2; `batch` is not used
# again in this cell.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])

# Training configuration: output goes to "m2m100_418M-fr", evaluation and
# checkpoint saving happen once per epoch, a single epoch is run with a
# per-device batch size of 2, evaluation uses generate(), fp16 is enabled
# and the result is pushed to the Hub.
args = Seq2SeqTrainingArguments(
    f"m2m100_418M-fr",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_accumulation_steps=3,
    predict_with_generate=True,
    push_to_hub=True,
    do_train=True,
    do_eval=True,
    fp16=True,
)

# Wire the model, arguments, tokenized splits, collator, tokenizer and the
# metric function into the trainer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Standalone evaluation before and after training is currently disabled.
# trainer.evaluate(max_length=max_target_length)
trainer.train()
# trainer.evaluate(max_length=max_target_length)
# Push the result to the Hub with the "translation" tag.
trainer.push_to_hub(tags="translation", commit_message="Training complete")


Using custom data configuration en-fr-lang1=en,lang2=fr
Reusing dataset kde4 (/root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac/cache-8f0f00a528fc2fcd.arrow and /root/.cache/huggingface/datasets/kde4/en-fr-lang1=en,lang2=fr/0.0.0/243129fb2398d5b0b4f7f6831ab27ad84774b7ce374cf10f60f6e1ff331648ac/cache-f0e27150032f24be.arrow


  0%|          | 0/190 [00:00<?, ?ba/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/1.80G [00:00<?, ?B/s]

Cloning https://huggingface.co/Jour/m2m100_418M-fr into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.47k/1.80G [00:00<?, ?B/s]

Download file runs/Nov29_00-32-07_jourdelune-OptiPlex-3050/events.out.tfevents.1638142381.jourdelune-OptiPlex-…

Download file runs/Nov29_00-19-11_jourdelune-OptiPlex-3050/events.out.tfevents.1638141611.jourdelune-OptiPlex-…

Download file runs/Nov29_00-16-42_jourdelune-OptiPlex-3050/1638141460.9573853/events.out.tfevents.1638141460.j…

Download file runs/Nov29_00-32-07_jourdelune-OptiPlex-3050/1638142381.2564037/events.out.tfevents.1638142381.j…

Download file training_args.bin: 100%|##########| 3.05k/3.05k [00:00<?, ?B/s]

Download file runs/Nov29_00-19-11_jourdelune-OptiPlex-3050/1638141611.3880417/events.out.tfevents.1638141611.j…

Clean file runs/Nov29_00-32-07_jourdelune-OptiPlex-3050/events.out.tfevents.1638142381.jourdelune-OptiPlex-305…

Clean file runs/Nov29_00-19-11_jourdelune-OptiPlex-3050/events.out.tfevents.1638141611.jourdelune-OptiPlex-305…

Clean file runs/Nov29_00-16-42_jourdelune-OptiPlex-3050/1638141460.9573853/events.out.tfevents.1638141460.jour…

Clean file runs/Nov29_00-32-07_jourdelune-OptiPlex-3050/1638142381.2564037/events.out.tfevents.1638142381.jour…

Download file runs/Nov29_00-16-42_jourdelune-OptiPlex-3050/events.out.tfevents.1638141460.jourdelune-OptiPlex-…

Download file sentencepiece.bpe.model:   0%|          | 3.48k/2.31M [00:00<?, ?B/s]

Clean file training_args.bin:  33%|###2      | 1.00k/3.05k [00:00<?, ?B/s]

Clean file runs/Nov29_00-19-11_jourdelune-OptiPlex-3050/1638141611.3880417/events.out.tfevents.1638141611.jour…

Clean file runs/Nov29_00-16-42_jourdelune-OptiPlex-3050/events.out.tfevents.1638141460.jourdelune-OptiPlex-305…

Clean file sentencepiece.bpe.model:   0%|          | 1.00k/2.31M [00:00<?, ?B/s]