In [106]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
import torch
import evaluate
from datasets import load_dataset
import numpy as np


In [12]:
# Load the pretrained seq2seq model and its tokenizer from a single checkpoint id,
# so the two can never be pointed at different checkpoints by accident.
MODEL_CHECKPOINT = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading: 100%|██████████| 1.21k/1.21k [00:00<00:00, 939kB/s]
Downloading: 100%|██████████| 242M/242M [00:13<00:00, 17.8MB/s]
Downloading: 100%|██████████| 792k/792k [00:00<00:00, 3.09MB/s]
Downloading: 100%|██████████| 1.39M/1.39M [00:00<00:00, 4.10MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [64]:
# Load only the first 5% of the cc_news "train" split (per the split slice below).
raw_ds = load_dataset("cc_news", split='train[:5%]')

Found cached dataset cc_news (/Users/yeeb/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6)


In [101]:
def preprocess(example, max_length=512):
    """Tokenize a batch of articles and build word-reversed targets for them.

    Intended for ``Dataset.map(..., batched=True)``: ``example["text"]`` is a
    list of strings. For each string the target is the same whitespace-separated
    words in reverse order.

    Args:
        example: Batch mapping with a ``"text"`` entry holding a list of strings.
        max_length: Length every source and target sequence is padded or
            truncated to. It is passed explicitly because the tokenizer warns
            against relying on its implicit 512-token limit; the default of 512
            keeps the previous behaviour.

    Returns:
        The tokenizer output for the source texts with an added ``"labels"``
        entry: the target token ids, with every pad token id replaced by -100
        (the conventional "ignore" label value for the loss).
    """
    text = example["text"]
    # Target text: the words of each document in reverse order, single-space joined.
    reverse_text = [" ".join(reversed(doc.split())) for doc in text]

    output = tokenizer(
        text, padding="max_length", truncation=True, max_length=max_length
    )
    labels = tokenizer(
        text_target=reverse_text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Mask padding positions in the targets so they are not scored as real tokens.
    pad_id = tokenizer.pad_token_id
    output["labels"] = [
        [token if token != pad_id else -100 for token in row]
        for row in labels["input_ids"]
    ]
    return output


In [102]:
# Fixed seed so the train/validation/test partition is identical on every
# Restart & Run All (train_test_split shuffles before splitting).
SPLIT_SEED = 42

# Tokenize the whole corpus, dropping the raw columns, and expose torch tensors.
tokenized_ds = raw_ds.map(preprocess, batched=True, remove_columns=raw_ds.column_names)
tokenized_ds.set_format("torch")

# 80% train / 20% held out; the held-out part is then halved into validation and test.
tokenized_ds = tokenized_ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_ds = tokenized_ds["train"]
valid_test = tokenized_ds["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)
valid_ds = valid_test["train"]
test_ds = valid_test["test"]


100%|██████████| 8/8 [00:23<00:00,  2.92s/ba]


In [108]:
# Load the "sacrebleu" metric; compute_metrics below calls metric.compute on decoded text.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Normalise decoded strings into the shape the metric call expects.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple where every prediction is stripped of
        surrounding whitespace and every reference is stripped and wrapped in
        its own single-element list.
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_refs = []
    for reference in labels:
        wrapped_refs.append([reference.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length from trainer predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        Dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), both rounded to four
        decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a masking sentinel, not a vocabulary id, so it cannot be decoded.
    # Labels carry it by construction; predictions can carry it too when batches
    # are padded together, so both are mapped back to the pad id before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap each reference in a list, as the metric expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Counted on the sanitised ids so -100 padding is not mistaken for real tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [109]:
# Seq2SeqTrainer needs Seq2SeqTrainingArguments: with args omitted it falls back
# to plain TrainingArguments, which has no generation settings.
# predict_with_generate=True makes evaluate()/predict() return generated token
# ids, which is what compute_metrics decodes (otherwise it would receive logits).
# output_dir matches the directory passed to trainer.save_model(output_dir='out').
training_args = Seq2SeqTrainingArguments(
    output_dir="out",
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
    # Inputs are already padded to a fixed length in preprocess, so plain stacking suffices.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

In [111]:
# Run training, then write the resulting model to the "out" directory.
train_result = trainer.train()
trainer.save_model(output_dir='out')

# Log and persist the metrics reported by the training run under the "train"
# split name, then save the trainer state.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

***** Running training *****
  Num examples = 6373
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2391
  Number of trainable parameters = 60506624
  0%|          | 0/2391 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Evaluate with the trainer (constructed with eval_dataset=valid_ds); metric
# names are prefixed with "eval".
# NOTE(review): max_length=512 is presumably the generation length cap, matching
# the 512-token limit mentioned in the tokenizer warning — confirm.
metrics = trainer.evaluate(max_length=512, metric_key_prefix="eval")

# Log and persist the evaluation metrics under the "eval" split name.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
# Run prediction on the held-out test split; metric names are prefixed with "predict".
# NOTE(review): max_length=512 is presumably the generation length cap — confirm.
predict_results = trainer.predict(test_ds, metric_key_prefix="predict", max_length=512)
metrics = predict_results.metrics

# Log and persist the test-set metrics under the "predict" split name.
trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

In [None]:
# Decode the test-set predictions back into text.
predicted_ids = predict_results.predictions
# -100 is a padding/masking sentinel, not a vocabulary id; if any is present in
# the gathered predictions it must become the pad id before decoding.
predicted_ids = np.where(predicted_ids != -100, predicted_ids, tokenizer.pad_token_id)

predictions = tokenizer.batch_decode(
    predicted_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
predictions = [pred.strip() for pred in predictions]

# Show a short preview only; the full list stays available in `predictions`.
print(predictions[:5])