<a href="https://colab.research.google.com/github/sirmammingtonham/topics_nlp/blob/main/seq2seq_reverse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers evaluate datasets sacrebleu accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.15.0-py3-none-any.whl (191 kB)
[2K  

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
import evaluate
from datasets import load_dataset
import numpy as np

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
raw_ds = load_dataset("ag_news", split='train[:50%]')

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


In [5]:
def preprocess(example):
    output = tokenizer(example["text"], max_length=256, truncation=True)
    output["labels"] = [ids[:-1][::-1] + [tokenizer.eos_token_id] for ids in output["input_ids"]]
    return output


In [6]:
tokenized_ds = raw_ds.map(preprocess, batched=True, remove_columns=raw_ds.column_names)
tokenized_ds.set_format("torch")
tokenized_ds = tokenized_ds.train_test_split(test_size=0.2)
train_ds = tokenized_ds["train"]
valid_test = tokenized_ds["test"].train_test_split(test_size=0.5)
valid_ds = valid_test["train"]
test_ds = valid_test["test"]


  0%|          | 0/60 [00:00<?, ?ba/s]

In [7]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [9]:
trainer = Seq2SeqTrainer(
        model=model,
        args=Seq2SeqTrainingArguments(
            output_dir="/content/output",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=16, 
            per_device_eval_batch_size=16, 
            predict_with_generate=True, 
            fp16=True,
        ),
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

Using cuda_amp half precision backend


In [10]:
train_result = trainer.train()
trainer.save_model(output_dir='out')

metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

***** Running training *****
  Num examples = 48000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9000
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.3156,0.034379,14.8805,18.9962
2,0.1339,0.012279,14.9775,18.9992
3,0.1089,0.00958,14.983,18.9992


***** Running Evaluation *****
  Num examples = 6000
  Batch size = 16
Saving model checkpoint to /content/output/checkpoint-3000
Configuration saved in /content/output/checkpoint-3000/config.json
Model weights saved in /content/output/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in /content/output/checkpoint-3000/tokenizer_config.json
Special tokens file saved in /content/output/checkpoint-3000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6000
  Batch size = 16
Saving model checkpoint to /content/output/checkpoint-6000
Configuration saved in /content/output/checkpoint-6000/config.json
Model weights saved in /content/output/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in /content/output/checkpoint-6000/tokenizer_config.json
Special tokens file saved in /content/output/checkpoint-6000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 6000
  Batch size = 16
Saving model checkpoint to /content/output/checkp

***** train metrics *****
  epoch                    =        3.0
  total_flos               =  3920802GF
  train_loss               =     0.6389
  train_runtime            = 0:25:25.18
  train_samples_per_second =     94.415
  train_steps_per_second   =      5.901


In [11]:
predict_results = trainer.predict(test_ds, metric_key_prefix="predict", max_length=256)
metrics = predict_results.metrics

trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

***** Running Prediction *****
  Num examples = 6000
  Batch size = 16


***** predict metrics *****
  predict_bleu               =    98.8354
  predict_gen_len            =    59.8638
  predict_loss               =     0.0096
  predict_runtime            = 0:06:17.41
  predict_samples_per_second =     15.898
  predict_steps_per_second   =      0.994


In [12]:
predictions = tokenizer.batch_decode(
	predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
predictions = [pred.strip() for pred in predictions]
print(predictions[:10])
with open('generated.txt', "w") as writer:
	writer.write("\n".join(predictions))

['. Open Texas the of point halfway the at lead the into move to Friday on 65under- fivea  shot Wilson Dean- ) NetworksSport (X T, Antonio San Open Texas at top on Wilson', '. final Olympic the complete to failure her forsbin Robally Smate- teamising critic publicly after actiondisciplinary  face eighting rows;39 # women Australian the of Members eighting row Australian insction Ru', '. rain acid ford blame emissions reduce to plants power ordered Tuesday on officials York New, emergency health publica  called they whating Cit- AP )AP (smission E Cut tos Plant Powers Order.Y. N', 'his meet will Meyer Urban Coach Utah, afternoon same the In. today chapters fresh of start the launch each will coach fired its and Florida,s beginning new of daya During  going, coming coachesUF', '. Thursday on Open Haven New5,00058$  the ofsfinal- quarter the in France ofchy Delieha Nat seed eighth to-5 76-4  lost she when blow bodya  dealt were Open USs;39 # week next fors preparations;39 #iatpri Ca Jenni