# Fine-tune T5-small on Wikihow

## Libraries and environment preparation

In [1]:
#Install essential packages
%%capture
! pip install datasets transformers rouge-score nltk wandb
!apt install git-lfs

In [3]:
#Colab Environment Check for GPU and RAM
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#GPU check
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 27.3 gigabytes of available RAM

Tue Feb  1 19:22:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [4]:
# Make sure your version of Transformers is at least 4.11.0 
# to run the following code correctly:
import transformers
import datasets
print(transformers.__version__)

4.16.2


In [5]:
# Import Wandb 
import os
import wandb
API_KEY = '39991c538626bee25c64d4f8a4c3403dd635537c'
os.environ["WANDB_API_KEY"] = API_KEY

## Loading the dataset and process

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
raw_datasets = datasets.load_dataset("wikihow", "all", "/content/drive/MyDrive/dataset")

Downloading:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Using custom data configuration all-6f5101161f12f62f


Downloading and preparing dataset wikihow/all (download: 5.21 MiB, generated: 524.29 MiB, post-processed: Unknown size, total: 529.50 MiB) to /root/.cache/huggingface/datasets/wikihow/all-6f5101161f12f62f/1.2.0/5343fc81d685acaa086c9cc19eb8706206cd1f8b315792b04c1d7b92091c305e...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/75.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/75.2k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset wikihow downloaded and prepared to /root/.cache/huggingface/datasets/wikihow/all-6f5101161f12f62f/1.2.0/5343fc81d685acaa086c9cc19eb8706206cd1f8b315792b04c1d7b92091c305e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'headline', 'title'],
        num_rows: 157252
    })
    validation: Dataset({
        features: ['text', 'headline', 'title'],
        num_rows: 5599
    })
    test: Dataset({
        features: ['text', 'headline', 'title'],
        num_rows: 5577
    })
})

In [9]:
model_checkpoint = "t5-small"
from transformers import T5TokenizerFast
tokenizer = T5TokenizerFast.from_pretrained("t5-small")

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [10]:
# If you are using one of the five T5 checkpoints we have to prefix 
# the inputs with "summarize:" (t5 is a multi-task model).

if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "

In [13]:
# tokenlize inputs into map

max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["headline"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/158 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'headline', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 157252
    })
    validation: Dataset({
        features: ['text', 'headline', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5599
    })
    test: Dataset({
        features: ['text', 'headline', 'title', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5577
    })
})

## Fine-tuning the model

In [23]:
# Import Huggingface Automodel class from model checkpoint and print details

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
 

In [24]:
# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
# keep track with wandb
wandb.init(project="T5-small")

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,█▆▆▁▃▅▂▄▄
eval/loss,█▅▄▃▂▂▁▁▁
eval/rouge1,▁▄▅▆▇▇███
eval/rouge2,▁▄▅▆▇▇███
eval/rougeL,▁▄▅▆▇▇███
eval/rougeLsum,▁▄▅▆▇▇███
eval/runtime,██▆▇▂▁█▂▂
eval/samples_per_second,▁▁▃▂▇█▁▇▇
eval/steps_per_second,▁▁▃▂▆█▁▇▇
train/epoch,▁▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▆▇▇▇███

0,1
eval/gen_len,18.3054
eval/loss,2.68134
eval/rouge1,25.0062
eval/rouge2,8.6
eval/rougeL,21.0014
eval/rougeLsum,24.344
eval/runtime,178.7794
eval/samples_per_second,31.318
eval/steps_per_second,1.958
train/epoch,0.92


Define `Seq2SeqTrainer` to compute the metrics from the predictions, and also do a bit of pre-processing to decode the predictions into texts:

In [19]:
# Define compute_metrics
import nltk
import numpy as np
nltk.download('punkt')

metric = datasets.load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [26]:
# Define traing args, batch size and epoch
# batch size max 8 for input length 1024 on Colab Pro

batch_size = 16
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-wikihow",
    load_best_model_at_end="eval_loss",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    logging_steps=500,  # set to 1000 for full training
    save_steps=1000,  # set to 500 for full training
    eval_steps=1000,  # set to 8000 for full training
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)

PyTorch: setting up devices


In [27]:
# Pass into the trainer

train_dataset=tokenized_datasets["train"]
eval_dataset=tokenized_datasets["validation"]

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp half precision backend


We can now finetune our model by just calling the `train` method:

In [28]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, headline, text.
***** Running training *****
  Num examples = 157252
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9829
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1000,2.9487,2.723632,23.9604,7.9177,20.2027,23.3441,18.3004
2000,2.8743,2.675549,25.1989,8.7274,21.2211,24.5621,18.3215
3000,2.8274,2.638959,25.1452,8.7519,21.2228,24.4862,18.4197
4000,2.8202,2.617266,25.6331,9.1006,21.6288,24.9828,18.4219
5000,2.7987,2.601493,25.7283,9.2046,21.779,25.0796,18.3977
6000,2.7572,2.583439,25.8228,9.3187,21.8999,25.1589,18.4476
7000,2.7768,2.574972,26.165,9.4873,22.1047,25.4985,18.4176
8000,2.7596,2.564936,26.0724,9.4176,21.9765,25.3998,18.5312
9000,2.7556,2.557888,26.0636,9.4029,22.0255,25.3854,18.4742


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, headline, text.
***** Running Evaluation *****
  Num examples = 5599
  Batch size = 16
Saving model checkpoint to t5-small-finetuned-wikihow/checkpoint-1000
Configuration saved in t5-small-finetuned-wikihow/checkpoint-1000/config.json
Model weights saved in t5-small-finetuned-wikihow/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-wikihow/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-wikihow/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [t5-small-finetuned-wikihow/checkpoint-7000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, headline, text.
***** Running Evaluation *****
  Num examples = 5599
  Batch size = 16


TrainOutput(global_step=9829, training_loss=2.818971254149074, metrics={'train_runtime': 5866.0072, 'train_samples_per_second': 26.807, 'train_steps_per_second': 1.676, 'total_flos': 2.128268860588032e+16, 'train_loss': 2.818971254149074, 'epoch': 1.0})

In [29]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▁▂▅▅▄▅▅█▆
eval/loss,█▆▄▄▃▂▂▁▁
eval/rouge1,▁▅▅▆▇▇███
eval/rouge2,▁▅▅▆▇▇███
eval/rougeL,▁▅▅▆▇▇███
eval/rougeLsum,▁▅▅▆▇▇███
eval/runtime,▇▇▁▆█▄▄▅▄
eval/samples_per_second,▂▂█▃▁▅▅▄▄
eval/steps_per_second,▂▂█▃▁▅▅▄▅
train/epoch,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇██

0,1
eval/gen_len,18.4742
eval/loss,2.55789
eval/rouge1,26.0636
eval/rouge2,9.4029
eval/rougeL,22.0255
eval/rougeLsum,25.3854
eval/runtime,178.0592
eval/samples_per_second,31.445
eval/steps_per_second,1.966
train/epoch,1.0


In [42]:
!ls t5-small-finetuned-wikihow

checkpoint-7000  checkpoint-8000  checkpoint-9000


In [43]:
!zip -r /content/t5-small-finetuned-wikihow.zip /content/t5-small-finetuned-wikihow/checkpoint-9000/

  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/ (stored 0%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/pytorch_model.bin (deflated 8%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/scaler.pt (deflated 55%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/tokenizer.json (deflated 74%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/training_args.bin (deflated 49%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/config.json (deflated 62%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/optimizer.pt (deflated 7%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/special_tokens_map.json (deflated 83%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/rng_state.pth (deflated 27%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/trainer_state.json (deflated 78%)
  adding: content/t5-small-finetuned-wikihow/checkpoint-9000/tokenizer_config.json (deflated 80%)
  a

In [45]:
!cp t5-small-finetuned-wikihow.zip '/content/drive/My Drive/weights/'

## Trying with a smaller dataset

In [30]:
# Init new logging params
wandb.init(project="T5-small")

In [31]:
# Select to get smaller dataset
small_train = raw_datasets['train'].select(list(range(0, 10000)))
small_val = raw_datasets['validation'].select(list(range(0, 1000)))
small_train

Dataset({
    features: ['text', 'headline', 'title'],
    num_rows: 10000
})

In [32]:
tokenized_train = small_train.map(preprocess_function, batched=True)
tokenized_val = small_val.map(preprocess_function, batched=True)
tokenized_train

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'headline', 'title', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [33]:
# Import a new T5-small
model_small = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
 

In [34]:
# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_small)

In [35]:
# Define traing args, batch size and epoch
# batch size max 16 on Colab Pro

batch_size = 16
epochs = 16
model_name = model_checkpoint.split("/")[-1]
args_small = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-cnn-small",
    load_best_model_at_end="eval_loss",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=1000,  # set to 1000 for full training
    save_steps=1250,  # set to 500 for full training
    eval_steps=1250,  # set to 8000 for full training
    save_total_limit=3,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)


PyTorch: setting up devices


In [36]:
# Pass into the trainer

train_dataset=tokenized_train
eval_dataset=tokenized_val

trainer_small = Seq2SeqTrainer(
    model_small,
    args_small,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp half precision backend


In [37]:
trainer_small.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, headline, text.
***** Running training *****
  Num examples = 10000
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1250,2.9311,2.727167,24.4153,8.2277,20.3017,23.7679,18.282
2500,2.7277,2.710439,24.9279,8.429,20.7324,24.2383,18.308
3750,2.6009,2.715899,24.6432,8.4963,20.4836,23.9888,18.261
5000,2.4361,2.716969,25.1444,8.7085,20.9137,24.4743,18.43
6250,2.3684,2.743895,25.1857,8.6989,20.7622,24.5202,18.349
7500,2.3219,2.754333,25.5251,8.9533,21.102,24.7775,18.402
8750,2.2876,2.760252,25.4397,8.9315,20.9988,24.7764,18.449
10000,2.2335,2.76846,25.5189,8.9084,21.0233,24.8291,18.441


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, headline, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to t5-small-finetuned-cnn-small/checkpoint-1250
Configuration saved in t5-small-finetuned-cnn-small/checkpoint-1250/config.json
Model weights saved in t5-small-finetuned-cnn-small/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-cnn-small/checkpoint-1250/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-cnn-small/checkpoint-1250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: title, headline, text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to t5-small-finetuned-cnn-small/checkpoint-2500
Configuration save

TrainOutput(global_step=10000, training_loss=2.4666728515625, metrics={'train_runtime': 4585.0343, 'train_samples_per_second': 34.896, 'train_steps_per_second': 2.181, 'total_flos': 2.165468823552e+16, 'train_loss': 2.4666728515625, 'epoch': 16.0})

In [38]:
 wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▂▃▁▇▄▆██
eval/loss,▃▁▂▂▅▆▇█
eval/rouge1,▁▄▂▆▆█▇█
eval/rouge2,▁▃▄▆▆███
eval/rougeL,▁▅▃▆▅█▇▇
eval/rougeLsum,▁▄▂▆▆███
eval/runtime,███▁▂▂▃▃
eval/samples_per_second,▁▁▁█▇▇▆▆
eval/steps_per_second,▁▁▁█▇▇▆▆
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇███

0,1
eval/gen_len,18.441
eval/loss,2.76846
eval/rouge1,25.5189
eval/rouge2,8.9084
eval/rougeL,21.0233
eval/rougeLsum,24.8291
eval/runtime,31.8939
eval/samples_per_second,31.354
eval/steps_per_second,1.975
train/epoch,16.0


In [47]:
!ls t5-small-finetuned-cnn-small

checkpoint-10000  checkpoint-2500  checkpoint-8750


In [48]:
!zip -r /content/t5-small-finetuned-wikihow-small.zip /content/!ls t5-small-finetuned-cnn-small/checkpoint-2500/

  adding: t5-small-finetuned-cnn-small/checkpoint-2500/ (stored 0%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/pytorch_model.bin (deflated 8%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/scaler.pt (deflated 55%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/tokenizer.json (deflated 74%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/training_args.bin (deflated 49%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/config.json (deflated 62%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/optimizer.pt (deflated 7%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/special_tokens_map.json (deflated 83%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/rng_state.pth (deflated 27%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/trainer_state.json (deflated 64%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/tokenizer_config.json (deflated 80%)
  adding: t5-small-finetuned-cnn-small/checkpoint-2500/scheduler.pt (

In [49]:
!cp t5-small-finetuned-wikihow-small.zip '/content/drive/My Drive/weights/'

In [50]:
from transformers import T5ForConditionalGeneration

In [51]:
num_start = 20
num_select = 10

In [52]:
small_test = raw_datasets['test'].select(list(range(num_start, num_start+num_select)))
small_test

Dataset({
    features: ['text', 'headline', 'title'],
    num_rows: 10
})

In [54]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token # to avoid an error

sentences = [prefix + sentence for sentence in small_test['text']] # use different length sentences to test batching
inputs = tokenizer([prefix + sentence for sentence in sentences], max_length=max_input_length, return_tensors="pt", padding=True)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


In [55]:
output_sequences = model.generate(
    input_ids=inputs['input_ids'].cuda(),
    attention_mask=inputs['attention_mask'].cuda(),
    do_sample=False, # disable sampling to test if batching affects output
)
prediction = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

In [56]:
output_sequences_small = model_small.generate(
    input_ids=inputs['input_ids'].cuda(),
    attention_mask=inputs['attention_mask'].cuda(),
    do_sample=False, # disable sampling to test if batching affects output
)
prediction_small = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

In [57]:
for i in range(num_select):
    print("Original Text: %s" % small_test[i]['text'])
    print("\nActual Summary: %s" % small_test[i]['headline'])
    print("\nBatch Predicted: %s" % prediction[i])
    print("\nSmall_Set Summary: %s" % prediction_small[i])
    print("=====================================================================\n")

Original Text: In general, dwarf hamsters have thickset bodies, large cheek pouches, and short tails. Make sure it does not have any nasal or eye discharge, nor any other signs of illness. Russian dwarf hamsters are particularly prone to diabetes. A hamster suffering from this condition will drink a lot of water and urinate more frequently than other dwarf hamsters.Be sure to consult your veterinarian if you suspect that your hamster has diabetes.
 A recently weaned or an extremely stressed out dwarf hamster may come down with a disease called “wet tail.” Your hamster experiences diarrhea — the excessive moisture from this causes its tail to become literally wet. Consult a veterinarian for a proper diagnosis and treatment.


Tyzzer’s disease causes diarrhea in young or stressed hamsters. This is a disease that needs veterinarian treatment. Certain antibiotics can cause and exacerbate this condition, so don’t treat your hamster on your own., Like dogs and cats, dwarf hamsters can suffer