## Preprocess data

In [1]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
import pytorch_lightning as pl
from tqdm import tnrange

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
dataset = load_dataset("xsum")

use_percent = 1
dataset_train = load_dataset("xsum", split=f"train[:{use_percent}%]")
dataset_val = load_dataset("xsum", split=f"validation[:{use_percent}%]")
dataset_test = load_dataset("xsum", split=f"test[:{use_percent}%]")
dataset = DatasetDict({'train': dataset_train, 'validation': dataset_val, 'test': dataset_test})

print(dataset)

Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)
Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)
Found cached dataset xsum (/home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 2040
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 113
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 113
    })
})


In [3]:
example = dataset['train'][0]

print("text:", example["document"])
print("Summary:", example["summary"])
print("id:", example["id"])

text: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate that - but it

In [4]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

prefix = "summarize: "
max_input_length = 512
max_target_length = 128

def preprocessor(examples, prefix='summarize:', max_input_length=512, max_target_length=128):
  # encode the code-docstring pairs
  texts = examples['document']
  summaries = examples['summary']
  
  inputs = [prefix + text for text in texts]
  model_inputs = tokenizer(inputs, max_length=max_input_length, padding="max_length", truncation=True)

  # encode the summaries
  labels = tokenizer(summaries, max_length=max_target_length, padding="max_length", truncation=True).input_ids

  # important: we need to replace the index of the padding tokens by -100
  # such that they are not taken into account by the CrossEntropyLoss
  labels_with_ignore_index = []
  for labels_example in labels:
    labels_example = [label if label != 0 else -100 for label in labels_example]
    labels_with_ignore_index.append(labels_example)
  
  model_inputs["labels"] = labels_with_ignore_index

  return model_inputs

In [5]:
dataset = dataset.map(preprocessor, batched=True)
print(dataset)

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71/cache-6c2509cd242fa181.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71/cache-cc4e71638378e916.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71/cache-493f47d1d77c8755.arrow


DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2040
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 113
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 113
    })
})


In [6]:
dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2040
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 113
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 113
    })
})


In [7]:
num_workers = os.cpu_count()
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8, num_workers=num_workers)
valid_dataloader = DataLoader(dataset['validation'], batch_size=4, num_workers=num_workers)
test_dataloader = DataLoader(dataset['test'], batch_size=4, num_workers=num_workers)

In [8]:
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


Let's verify an example, by decoding it back into text:

In [9]:
tokenizer.decode(batch['input_ids'][0])

"summarize:Media playback is not supported on this device Evans went close for Chesterfield early on before the visitors went ahead midway through the first half when Jay O'Shea calmly slotted in from close range from Kristian Dennis' threaded pass. Sam Walker pushed away O'Shea's low effort soon after, but Kurtis Guthrie's diving header flashed inches wide and Drey Wright was denied by keeper Ryan Fulton. Colchester needed just 53 seconds to equalise after the interval when substitute Tarique Fosu netted with his first touch after coming on with a low finish, after the ball had broken to him in the area. Colchester captain Luke Prosser's header from Wright's corner then rattled the bar. But Chesterfield restored their advantage six minutes into the second half through Evans, who emphatically volleyed home after Tom Anderson had nodded on a free-kick in the area. Guthrie was later denied by Fulton but Chesterfield claimed a narrow victory. Report supplied by the Press Association. Matc

In [10]:
labels = batch['labels'][0]
tokenizer.decode([label for label in labels if label != -100])

'Ched Evans grabbed the winner as Chesterfield ended their long run without a victory by triumphing at Colchester United in the FA Cup first round.</s>'

## Fine-tune using PyTorch Lightning


In [7]:
class T5(pl.LightningModule):
    def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000):
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):     
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs
    
    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss
      
    def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     

        return loss

    def configure_optimizers(self):
        # create optimizer
        optimizer = AdamW(self.model.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}
        
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}
    
    def generate(self, input_ids):
        return self.model.generate(input_ids)

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return valid_dataloader

    def test_dataloader(self):
        return test_dataloader

We can now simply initialize the model and start training on Colab's GPU.

In [12]:
# Hyperparameters
lr = 5e-5
num_train_epochs = 15
warmup_steps = 1000
patience = 3
max_epochs = 1

# for early stopping, see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=patience,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')
checkpoint_callback = ModelCheckpoint(dirpath='./saved/checkpoints/', monitor='validation_loss', mode='min', save_top_k = 1)

accelerator = "gpu" if torch.cuda.is_available() else "cpu"
trainer = Trainer(accelerator=accelerator, 
                  callbacks=[early_stop_callback, lr_monitor, checkpoint_callback], max_epochs=max_epochs)

model = T5(lr=lr, num_train_epochs=num_train_epochs, warmup_steps=warmup_steps)
trainer.fit(model)
tokenizer.save_pretrained("./saved/tokenizers/")
checkpoint_path = checkpoint_callback.best_model_path

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


## Inference

In [8]:
test_dataset = dataset['test']
test_document = test_dataset['document'][0]
test_input_ids = test_dataset['input_ids'][0]
test_gt_summary = test_dataset['summary'][0]
test_attention_mask = test_dataset['attention_mask'][0]
del test_dataset, dataset

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer_save_directory="./saved/tokenizers/"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_directory)
checkpoint_path = './saved/saved_checkpoints/epoch=4-step=6380.ckpt'
new_model = T5.load_from_checkpoint(checkpoint_path).to(device)
new_model.eval()

T5(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=2048, ou

In [10]:
print("Input document:")
print(test_document)

Input document:
Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.
Andrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the nee

In [11]:
print("Ground truth summary:")
print(test_gt_summary)

Ground truth summary:
There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.


In [12]:
input_ids_tensor = test_input_ids.clone().detach().reshape((1,-1))
input_ids_tensor = input_ids_tensor.to(device)
output_ids = new_model.generate(input_ids_tensor)
generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [13]:
print("Generated summary:")
print(generated_summary)

Generated summary:
Prison leavers are being treated as "critically" as they are being treated as "


Inference using forward pass

In [25]:
length=20
document = test_document
label_tensor = torch.tensor(tokenizer.pad_token_id).reshape((1, -1)).to(device)
attention_mask = test_attention_mask.clone().detach().reshape((1, -1)).to(device)
for i in range(length):
    outputs=new_model(input_ids=input_ids_tensor[:, -length:], attention_mask=attention_mask[:, -500:], labels=label_tensor)
    # Get the logits
    logits = outputs.logits

    # Get the token ids of the most likely tokens
    token_ids = torch.argmax(logits, dim=-1)

    # Decode the token ids to get the text
    #document = tokenizer.decode(token_ids[0], skip_special_tokens=True)
    # input_ids_tensor = torch.cat((input_ids_tensor, token_ids), dim=1)
    label_tensor = torch.cat((label_tensor, token_ids), dim=1).reshape((1, -1))
    attention_mask = torch.tensor(tokenizer.encode_plus(document)['attention_mask']).reshape((1,-1)).to(device)

Token indices sequence length is longer than the specified maximum sequence length for this model (775 > 512). Running this sequence through the model will result in indexing errors


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 14.76 GiB total capacity; 13.74 GiB already allocated; 29.75 MiB free; 14.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [88]:
document_tokenized = tokenizer.encode_plus(document)
document

'Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.\nWorkers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.\nThe Welsh Government said more people than ever were getting help to address housing problems.\nChanges to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.\nPrison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.\nHowever, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.\nAndrew Stevens, who works in Welsh prisons trying to secure housing for prison leavers, said the need for acc

In [76]:
summary= input_ids_tensor[0][-length:]
# Decode the token ids to get the text
document_decoded = [tokenizer.decode(token_id, skip_special_tokens=True) for token_id in summary]
print(" ".join(document_decoded))

Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison Prison
