<a href="https://colab.research.google.com/github/sirmammingtonham/topics_nlp/blob/main/seq2seq_reverse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers evaluate datasets sacrebleu accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.15.0-py3-none-any.whl (191 kB)
[2K  

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
import evaluate
from datasets import load_dataset
import numpy as np

In [7]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
raw_ds = load_dataset("ag_news")#, split='train[:5%]')

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
def preprocess(example):
    text = example["text"]
    reverse_text = [" ".join((x.split()[::-1])) for x in text]
    output = tokenizer(example["text"], max_length=256, truncation=True)
    labels = tokenizer(text_target=reverse_text, max_length=256, truncation=True)
    output["labels"] = labels["input_ids"]
    return output


In [14]:
tokenized_ds = raw_ds.map(preprocess, batched=True, remove_columns=raw_ds['train'].column_names)
tokenized_ds.set_format("torch")
# tokenized_ds = tokenized_ds.train_test_split(test_size=0.2)
train_ds = tokenized_ds["train"]
valid_test = tokenized_ds["test"].train_test_split(test_size=0.5)
valid_ds = valid_test["train"]
test_ds = valid_test["test"]


  0%|          | 0/120 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [15]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [17]:
trainer = Seq2SeqTrainer(
        model=model,
        args=Seq2SeqTrainingArguments(
            output_dir="/content/output",
            evaluation_strategy = "epoch",
            save_total_limit=2,
            learning_rate=2e-5,
            per_device_train_batch_size=16, 
            per_device_eval_batch_size=16, 
            predict_with_generate=True, 
            fp16=True,
        ),
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

Using cuda_amp half precision backend


In [18]:
train_result = trainer.train()
trainer.save_model(output_dir='out')

metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

***** Running training *****
  Num examples = 120000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 22500
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.0724,0.008684,16.0866,18.9984
2,0.0395,0.004678,16.0949,19.0
3,0.0293,0.004041,16.0954,19.0


Saving model checkpoint to /content/output/checkpoint-500
Configuration saved in /content/output/checkpoint-500/config.json
Model weights saved in /content/output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/output/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/output/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/output/checkpoint-1000
Configuration saved in /content/output/checkpoint-1000/config.json
Model weights saved in /content/output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/output/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/output/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to /content/output/checkpoint-1500
Configuration saved in /content/output/checkpoint-1500/config.json
Model weights saved in /content/output/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in /content/output/checkpoint-1500/token

***** train metrics *****
  epoch                    =        3.0
  total_flos               =  9738483GF
  train_loss               =     0.1949
  train_runtime            = 0:57:35.02
  train_samples_per_second =    104.196
  train_steps_per_second   =      6.512


In [19]:
predict_results = trainer.predict(test_ds, metric_key_prefix="predict", max_length=256)
metrics = predict_results.metrics

trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)

***** Running Prediction *****
  Num examples = 3800
  Batch size = 16


***** predict metrics *****
  predict_bleu               =     99.089
  predict_gen_len            =    58.6937
  predict_loss               =     0.0045
  predict_runtime            = 0:04:24.74
  predict_samples_per_second =     14.353
  predict_steps_per_second   =      0.899


In [20]:
predictions = tokenizer.batch_decode(
	predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
predictions = [pred.strip() for pred in predictions]
print(predictions[:10])
with open('generated.txt', "w") as writer:
	writer.write("\n".join(predictions))

