In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import BartTokenizer
import evaluate
import numpy as np


In [2]:
checkpoint = "facebook/bart-base"

In [3]:
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2)

In [4]:
books["train"][0]

{'id': '95416',
 'translation': {'en': '"Are they good to eat?" asked Pencroft.',
  'fr': '-- Et cela se mange? demanda Pencroff.'}}

In [5]:
tokenizer = BartTokenizer.from_pretrained(checkpoint)

In [6]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [7]:
tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [9]:
metric = evaluate.load("sacrebleu")

In [10]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

### Train

In [11]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)



model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="bart",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,                                                                                                                                                                                                                         
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
    tokenizer=tokenizer,
    data_collator=data_collator,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
    compute_metrics=compute_metrics,
)

trainer.train()

In [13]:
import torch
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1660 Ti'

### Inference

In [14]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
text_list = [
    "translate English to French: Legumes share resources with nitrogen-fixing bacteria.",
    "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
]

In [15]:
text_list_en = [trans["en"] for trans in books["train"][1:10]["translation"]]
# text_list_en
text_list_fr = [trans["fr"] for trans in books["train"][1:10]["translation"]]
text_list_fr

["Le bon Jacques court à son secours, l'aide à remonter, et de l'effort qu'il fait, il est précipité dans la mer à la vue du matelot, qui le laissa périr sans daigner seulement le regarder.",
 "Mais aussi, précisément parce que les hasards de ma navigation m'ont amené, le 21 mars, dans ces mers, mon point sera facile à relever, si, à midi, le soleil se montre à nos yeux.",
 "Depuis quelques jours il l'appelait «la veuve».",
 'Fabrizio wrote to the Conte that if ever the Prince had enough intelligence to perceive the mess into which the Ministers, Rassi, Fabio Conti, Zurla and others of like capacity had thrown his affairs, he, Fabrizio, would be the natural channel through which he would take action without unduly compromising his self-esteem.',
 "En ce moment, j'étais appuyé à l'avant, sur le bastingage de tribord.",
 '« Qu’est-ce qui est réglé ? se demandait la tante en courant a sa chambre pour se préparer au départ.',
 'Mme de Rênal crut sincèrement qu’elle allait devenir folle ; e

In [18]:
from transformers import pipeline

translator = pipeline("translation_en_to_fr", model="./my_awesome_opus_books_model/checkpoint-12500/")
output = translator(text_list_en)

In [19]:
preds = [item["translation_text"] for item in output]
preds

["L'honnête James, oubliant la blessure qu'il avait si récemment reçue de lui, flotta à son aide, et, avec beaucoup de difficulté, l'entraîna à nouveau, mais, ne résistant pas à la tentative, fut, par un coup soudain du navire, jeté au-dessus du navire, en vue de l'homme qu'il avait risqué sa vie pour sauver et qui ne l'avait pas le moindre avis de lui dans cette détresse.",
 "Mais précisément parce que la chance des marins m'a conduite dans ces mers le 21 mars, il sera facile d'obtenir notre appui si le soleil du midi apparaît devant nos yeux.",
 "Pendant quelques jours, il l'avait parlée comme la veuve.",
 'Fabrice wrote to the comte that if ever the prince had a sufficient spirit to see him in the manner in which the Ministers Rassi, Fabio Conti, Zurla and other forces had jet their affairs, he, Fabrice, would the canal natural by the way that he would make a démarche, without too much compromise his love-propre.',
 "Juste alors je étais à l'arrière, en s'inclinant sur le rail de l'

#### Without Pipeline

In [20]:
from transformers import AutoTokenizer
PATH = "./my_awesome_opus_books_model/checkpoint-12500/"
tokenizer = AutoTokenizer.from_pretrained(PATH)
inputs = tokenizer(text_list_en, return_tensors="pt").input_ids
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(PATH)
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [28]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"Les legumes partagent les ressources avec des bactéries d'azote fixateurs."

In [18]:
books["train"][0]

{'id': '8942',
 'translation': {'en': '"The lady who built the new part of this house as that tablet records, and whose son overlooks and directs everything here."',
  'fr': "-- C'est la dame qui a bâti la nouvelle partie de cette maison, ainsi que l'indique l'inscription. Son fils a maintenant la direction générale de l'école."}}

In [21]:
# evalute bleu score
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=preds, references=text_list_fr)
print(results)


{'bleu': 0.18854615193357083, 'precisions': [0.48928571428571427, 0.24354243542435425, 0.14122137404580154, 0.07509881422924901], 'brevity_penalty': 1.0, 'length_ratio': 1.1336032388663968, 'translation_length': 280, 'reference_length': 247}


In [22]:
def evaluate_bleu(dataset, translator):
    input = [trans["en"] for trans in dataset["translation"]]
    # text_list_en
    target = [trans["fr"] for trans in dataset["translation"]]
    output = translator(input)
    preds = [item["translation_text"] for item in output]
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=preds, references=target)
    return results
    

In [25]:
bleu = evaluate_bleu(books["train"][:100], translator)

In [26]:
print(bleu)

{'bleu': 0.20030559949561164, 'precisions': [0.5087719298245614, 0.26074011059123775, 0.14837849844513548, 0.08178438661710037], 'brevity_penalty': 1.0, 'length_ratio': 1.0478837109876016, 'translation_length': 2451, 'reference_length': 2339}
