This notebook is based on the script [run_summarization_no_trainer.py](https://github.com/huggingface/transformers/blob/v4.12.5/examples/pytorch/summarization/run_summarization_no_trainer.py) from HuggingFace.

In [1]:
import datasets
import torch
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

In [2]:
# Load the fast tokenizer for the 't5-small' checkpoint, keeping downloaded
# files in a local cache directory next to the notebook.
# NOTE(review): the directory is spelled "tf-small" whereas the model cache
# uses "t5-small" -- presumably a typo; left unchanged so existing caches are reused.
tokenizer = AutoTokenizer.from_pretrained(
    't5-small',
    cache_dir='./cache/tf-small_tokenizer',
    use_fast=True,
)

In [3]:
# Load the pretrained 't5-small' sequence-to-sequence model, caching the
# downloaded weights in a local directory.
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small', cache_dir='./cache/t5-small_model')

In [7]:
# Fetch the XSum dataset; the returned object is indexed by split name
# ("train", "validation") in later cells.
raw_datasets = load_dataset(path='xsum')

Using custom data configuration default
Reusing dataset xsum (/users/sarafael/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# --- Sequence length limits passed to the tokenizer as `max_length` ---
max_source_length, max_target_length = 1024, 128

# --- Padding behaviour ---
# False leaves sequences unpadded at tokenization time; the alternative is 'max_length'.
padding = False
ignore_pad_token_for_loss = True
# Value used to pad label sequences; the alternative is tokenizer.pad_token_id.
label_pad_token_id = -100

# --- Batch sizes for the training and evaluation dataloaders ---
per_device_train_batch_size = per_device_eval_batch_size = 4

In [5]:
def preprocess_function(examples, text_column='document',
                        summary_column='summary', prefix='summarize: '):
    """Tokenize a batch of documents and their summaries.

    Relies on the notebook-level ``tokenizer``, ``max_source_length``,
    ``max_target_length``, ``padding`` and ``ignore_pad_token_for_loss``.

    Args:
        examples: Batch indexed by column name; ``examples[text_column]`` and
            ``examples[summary_column]`` are iterated as sequences of strings.
        text_column: Column holding the source documents.
        summary_column: Column holding the target summaries.
        prefix: String prepended to every source document before tokenization.

    Returns:
        The tokenizer output for the prefixed documents, with an extra
        ``"labels"`` entry holding the token ids of the summaries.
    """
    prefixed_documents = [prefix + document for document in examples[text_column]]
    summaries = examples[summary_column]

    model_inputs = tokenizer(prefixed_documents, max_length=max_source_length,
                             padding=padding, truncation=True)

    # Summaries are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(summaries, max_length=max_target_length,
                           padding=padding, truncation=True)

    # When padding to a fixed length here, swap every pad token id in the
    # labels for -100 so that padding is ignored in the loss.
    if padding == "max_length" and ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        masked_labels = []
        for token_ids in labels["input_ids"]:
            masked_labels.append(
                [-100 if token_id == pad_id else token_id for token_id in token_ids]
            )
        labels["input_ids"] = masked_labels

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Collator that batches tokenized examples for the seq2seq model; label
# sequences are padded with `label_pad_token_id`.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, model=model, label_pad_token_id=label_pad_token_id,
)

In [12]:
# Tokenize every split in batches. The original columns are removed so that
# only the fields returned by `preprocess_function` remain.
processed_datasets = raw_datasets.map(
    function=preprocess_function,
    desc="Running tokenizer on dataset",
    remove_columns=raw_datasets["train"].column_names,
    batched=True,
)

Running tokenizer on dataset:   0%|          | 0/205 [00:00<?, ?ba/s]

Loading cached processed dataset at /users/sarafael/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-538edee0d0d03148.arrow
Loading cached processed dataset at /users/sarafael/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499/cache-03e5ccf5334d760b.arrow


In [13]:
# Pick out the tokenized splits used for training and for evaluation.
train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["validation"]

In [14]:
# Training batches are drawn in shuffled order; both loaders use the seq2seq
# collator to assemble batches.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=per_device_train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Evaluation keeps the dataset order (no shuffling).
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=per_device_eval_batch_size,
    collate_fn=data_collator,
)

In [15]:
# Split the model parameters into two optimizer groups: parameters whose name
# contains one of the `no_decay` substrings always get weight_decay 0.0, all
# others get the configurable `weight_decay`.
weight_decay = 0.0
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            param
            for name, param in model.named_parameters()
            if all(marker not in name for marker in no_decay)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            param
            for name, param in model.named_parameters()
            if any(marker in name for marker in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

In [16]:
# AdamW over the two parameter groups defined above, with a fixed learning rate.
optimizer = AdamW(params=optimizer_grouped_parameters, lr=5e-5)

In [17]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that the notebook does not fail on machines without a GPU (the previous
# hard-coded ordinal 0 always required CUDA).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in training mode; the trailing ';' suppresses the cell output.
model.train();

In [None]:
# Single pass over the training data: one optimizer update per batch.
for step, batch in enumerate(train_dataloader):
    # Clear the gradients accumulated by the previous update.
    optimizer.zero_grad()

    # Move the batch to the model's device, then run the forward pass.
    batch = batch.to(device)
    outputs = model(**batch)

    # Backpropagate the loss reported by the model and apply the update.
    loss = outputs.loss
    loss.backward()
    optimizer.step()

In [None]:
# NOTE(review): this cell uses `tqdm`, `train_loader` and `optim`, none of which
# are defined in the visible cells, and the keyword arguments (token_type_ids,
# start_positions, end_positions) look like they target an extractive
# question-answering model rather than the seq2seq model loaded above. It
# appears to be left over from another notebook -- TODO confirm before running.
for epoch in range(1):
    # Wrap the loader itself (not the enumerate object) so that tqdm can read
    # its length and display a total / ETA.
    for i, batch in enumerate(tqdm(train_loader)):
        optim.zero_grad()
        outputs = model(
            input_ids=batch[0].to(device),
            token_type_ids=batch[1].to(device),
            attention_mask=batch[2].to(device),
            start_positions=batch[3].to(device),
            end_positions=batch[4].to(device),
        )

        # The loss is read from the first element of the model output.
        loss = outputs[0]
        loss.backward()
        optim.step()

In [None]:
# NOTE(review): `args`, `accelerator`, `lr_scheduler`, `progress_bar` and
# `completed_steps` are not defined in the visible cells; this loop looks like
# it was copied from the reference script -- TODO define them (or confirm they
# exist elsewhere) before running. Presumably `accelerator` handles device
# placement of `batch`; verify.
for epoch in range(args.num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        # Scale the loss so that gradients summed over one accumulation window
        # average over its micro-batches.
        loss = outputs.loss / args.gradient_accumulation_steps
        accelerator.backward(loss)

        # Update once a full window of micro-batches has been accumulated, or
        # on the last batch of the epoch. `step + 1` is used because `step` is
        # zero-based: testing `step % k == 0` would trigger an update at step 0
        # after a single micro-batch.
        window_complete = (step + 1) % args.gradient_accumulation_steps == 0
        last_batch = step == len(train_dataloader) - 1
        if window_complete or last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1