In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np


In [3]:
checkpoint = "t5-small"

In [4]:
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2)

In [5]:
books["train"][0]

{'id': '112391',
 'translation': {'en': 'Black night reigned again; and there was no hope left of being able to dissipate the palpable darkness.',
  'fr': "L'obscurité redevint absolue. Il ne fallait plus songer à dissiper ces impénétrables ténèbres."}}

In [6]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [8]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
metric = evaluate.load("sacrebleu")

In [11]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Train

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [23]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_opus_books_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,                                                                                                                                                                                                                         
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
    tokenizer=tokenizer,
    data_collator=data_collator,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
    compute_metrics=compute_metrics,
)

# trainer.train()

  0%|          | 0/12710 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  4%|▍         | 500/12710 [09:47<4:05:12,  1.20s/it]Checkpoint destination directory my_awesome_opus_books_model/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 2.0929, 'learning_rate': 1.9213217938631003e-05, 'epoch': 0.08}


  8%|▊         | 1000/12710 [19:38<4:01:09,  1.24s/it]

{'loss': 2.0004, 'learning_rate': 1.8426435877261997e-05, 'epoch': 0.16}


 12%|█▏        | 1500/12710 [29:28<3:04:02,  1.02it/s]

{'loss': 1.9677, 'learning_rate': 1.7641227380015738e-05, 'epoch': 0.24}


 16%|█▌        | 2000/12710 [39:17<3:15:25,  1.09s/it]

{'loss': 1.9386, 'learning_rate': 1.6854445318646736e-05, 'epoch': 0.31}


 20%|█▉        | 2500/12710 [49:07<2:52:18,  1.01s/it]

{'loss': 1.9146, 'learning_rate': 1.6067663257277734e-05, 'epoch': 0.39}


 24%|██▎       | 3000/12710 [59:04<2:58:12,  1.10s/it]

{'loss': 1.8917, 'learning_rate': 1.5280881195908735e-05, 'epoch': 0.47}


 28%|██▊       | 3500/12710 [1:08:54<3:06:36,  1.22s/it]

{'loss': 1.9084, 'learning_rate': 1.4494099134539735e-05, 'epoch': 0.55}


 31%|███▏      | 4000/12710 [1:18:44<2:38:25,  1.09s/it]

{'loss': 1.8863, 'learning_rate': 1.3707317073170734e-05, 'epoch': 0.63}


 35%|███▌      | 4500/12710 [1:28:37<2:25:00,  1.06s/it]

{'loss': 1.8802, 'learning_rate': 1.2920535011801732e-05, 'epoch': 0.71}


 39%|███▉      | 5000/12710 [1:38:33<2:37:23,  1.22s/it]

{'loss': 1.8655, 'learning_rate': 1.2133752950432732e-05, 'epoch': 0.79}


 43%|████▎     | 5500/12710 [1:48:25<2:16:07,  1.13s/it]

{'loss': 1.8654, 'learning_rate': 1.1348544453186468e-05, 'epoch': 0.87}


 47%|████▋     | 6000/12710 [1:58:16<2:14:10,  1.20s/it]

{'loss': 1.8646, 'learning_rate': 1.0561762391817467e-05, 'epoch': 0.94}


                                                        
 50%|█████     | 6355/12710 [2:21:36<1:36:21,  1.10it/s]

{'eval_loss': 1.634561538696289, 'eval_bleu': 5.4932, 'eval_gen_len': 17.6202, 'eval_runtime': 981.0866, 'eval_samples_per_second': 25.907, 'eval_steps_per_second': 1.62, 'epoch': 1.0}


 51%|█████     | 6500/12710 [2:24:25<1:56:35,  1.13s/it]   

{'loss': 1.8615, 'learning_rate': 9.776553894571205e-06, 'epoch': 1.02}


 55%|█████▌    | 7000/12710 [2:34:23<2:12:56,  1.40s/it]

{'loss': 1.8411, 'learning_rate': 8.989771833202203e-06, 'epoch': 1.1}


 59%|█████▉    | 7500/12710 [2:44:22<1:54:27,  1.32s/it]

{'loss': 1.8462, 'learning_rate': 8.202989771833204e-06, 'epoch': 1.18}


 63%|██████▎   | 8000/12710 [2:54:10<1:16:44,  1.02it/s]

{'loss': 1.8322, 'learning_rate': 7.416207710464202e-06, 'epoch': 1.26}


 67%|██████▋   | 8500/12710 [3:04:10<1:27:58,  1.25s/it]

{'loss': 1.8249, 'learning_rate': 6.629425649095201e-06, 'epoch': 1.34}


 71%|███████   | 9000/12710 [3:14:07<1:12:32,  1.17s/it]

{'loss': 1.8276, 'learning_rate': 5.8426435877262e-06, 'epoch': 1.42}


 75%|███████▍  | 9500/12710 [3:24:02<1:05:17,  1.22s/it]

{'loss': 1.8248, 'learning_rate': 5.0558615263571994e-06, 'epoch': 1.49}


 79%|███████▊  | 10000/12710 [3:34:04<45:51,  1.02s/it] 

{'loss': 1.8149, 'learning_rate': 4.269079464988198e-06, 'epoch': 1.57}


 83%|████████▎ | 10500/12710 [3:44:01<48:55,  1.33s/it]  

{'loss': 1.8284, 'learning_rate': 3.4838709677419357e-06, 'epoch': 1.65}


 87%|████████▋ | 11000/12710 [3:54:09<35:12,  1.24s/it]  

{'loss': 1.8252, 'learning_rate': 2.6986624704956728e-06, 'epoch': 1.73}


 90%|█████████ | 11500/12710 [4:04:11<23:47,  1.18s/it]

{'loss': 1.8128, 'learning_rate': 1.911880409126672e-06, 'epoch': 1.81}


 94%|█████████▍| 12000/12710 [4:14:10<14:25,  1.22s/it]

{'loss': 1.8134, 'learning_rate': 1.1250983477576713e-06, 'epoch': 1.89}


 98%|█████████▊| 12500/12710 [4:24:07<03:38,  1.04s/it]

{'loss': 1.8248, 'learning_rate': 3.3831628638867033e-07, 'epoch': 1.97}


                                                       
100%|██████████| 12710/12710 [4:44:29<00:00,  1.34s/it]

{'eval_loss': 1.6118154525756836, 'eval_bleu': 5.6954, 'eval_gen_len': 17.6085, 'eval_runtime': 977.3471, 'eval_samples_per_second': 26.006, 'eval_steps_per_second': 1.626, 'epoch': 2.0}
{'train_runtime': 17069.4332, 'train_samples_per_second': 11.912, 'train_steps_per_second': 0.745, 'train_loss': 1.87324542654114, 'epoch': 2.0}





TrainOutput(global_step=12710, training_loss=1.87324542654114, metrics={'train_runtime': 17069.4332, 'train_samples_per_second': 11.912, 'train_steps_per_second': 0.745, 'train_loss': 1.87324542654114, 'epoch': 2.0})

In [13]:
import torch
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1660 Ti'

### Inference

In [14]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
text_list = [
    "translate English to French: Legumes share resources with nitrogen-fixing bacteria.",
    "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
]

In [15]:
text_list_en = [trans["en"] for trans in books["train"][1:10]["translation"]]
# text_list_en
text_list_fr = [trans["fr"] for trans in books["train"][1:10]["translation"]]
text_list_fr

["Le bon Jacques court à son secours, l'aide à remonter, et de l'effort qu'il fait, il est précipité dans la mer à la vue du matelot, qui le laissa périr sans daigner seulement le regarder.",
 "Mais aussi, précisément parce que les hasards de ma navigation m'ont amené, le 21 mars, dans ces mers, mon point sera facile à relever, si, à midi, le soleil se montre à nos yeux.",
 "Depuis quelques jours il l'appelait «la veuve».",
 'Fabrizio wrote to the Conte that if ever the Prince had enough intelligence to perceive the mess into which the Ministers, Rassi, Fabio Conti, Zurla and others of like capacity had thrown his affairs, he, Fabrizio, would be the natural channel through which he would take action without unduly compromising his self-esteem.',
 "En ce moment, j'étais appuyé à l'avant, sur le bastingage de tribord.",
 '« Qu’est-ce qui est réglé ? se demandait la tante en courant a sa chambre pour se préparer au départ.',
 'Mme de Rênal crut sincèrement qu’elle allait devenir folle ; e

In [18]:
from transformers import pipeline

translator = pipeline("translation_en_to_fr", model="./my_awesome_opus_books_model/checkpoint-12500/")
output = translator(text_list_en)

In [19]:
preds = [item["translation_text"] for item in output]
preds

["L'honnête James, oubliant la blessure qu'il avait si récemment reçue de lui, flotta à son aide, et, avec beaucoup de difficulté, l'entraîna à nouveau, mais, ne résistant pas à la tentative, fut, par un coup soudain du navire, jeté au-dessus du navire, en vue de l'homme qu'il avait risqué sa vie pour sauver et qui ne l'avait pas le moindre avis de lui dans cette détresse.",
 "Mais précisément parce que la chance des marins m'a conduite dans ces mers le 21 mars, il sera facile d'obtenir notre appui si le soleil du midi apparaît devant nos yeux.",
 "Pendant quelques jours, il l'avait parlée comme la veuve.",
 'Fabrice wrote to the comte that if ever the prince had a sufficient spirit to see him in the manner in which the Ministers Rassi, Fabio Conti, Zurla and other forces had jet their affairs, he, Fabrice, would the canal natural by the way that he would make a démarche, without too much compromise his love-propre.',
 "Juste alors je étais à l'arrière, en s'inclinant sur le rail de l'

#### Without Pipeline

In [20]:
from transformers import AutoTokenizer
PATH = "./my_awesome_opus_books_model/checkpoint-12500/"
tokenizer = AutoTokenizer.from_pretrained(PATH)
inputs = tokenizer(text_list_en, return_tensors="pt").input_ids
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(PATH)
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [28]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"Les legumes partagent les ressources avec des bactéries d'azote fixateurs."

In [18]:
books["train"][0]

{'id': '8942',
 'translation': {'en': '"The lady who built the new part of this house as that tablet records, and whose son overlooks and directs everything here."',
  'fr': "-- C'est la dame qui a bâti la nouvelle partie de cette maison, ainsi que l'indique l'inscription. Son fils a maintenant la direction générale de l'école."}}

In [21]:
# evalute bleu score
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=preds, references=text_list_fr)
print(results)


{'bleu': 0.18854615193357083, 'precisions': [0.48928571428571427, 0.24354243542435425, 0.14122137404580154, 0.07509881422924901], 'brevity_penalty': 1.0, 'length_ratio': 1.1336032388663968, 'translation_length': 280, 'reference_length': 247}


In [22]:
def evaluate_bleu(dataset, translator):
    input = [trans["en"] for trans in dataset["translation"]]
    # text_list_en
    target = [trans["fr"] for trans in dataset["translation"]]
    output = translator(input)
    preds = [item["translation_text"] for item in output]
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=preds, references=target)
    return results
    

In [25]:
bleu = evaluate_bleu(books["train"][:100], translator)

In [26]:
print(bleu)

{'bleu': 0.20030559949561164, 'precisions': [0.5087719298245614, 0.26074011059123775, 0.14837849844513548, 0.08178438661710037], 'brevity_penalty': 1.0, 'length_ratio': 1.0478837109876016, 'translation_length': 2451, 'reference_length': 2339}
