# Fine-tune BART on XSum

## Libraries and environment preparation

In [None]:
#Install essential packages
%%capture
!pip install datasets transformers rouge-score nltk wandb
!apt install git-lfs

In [None]:
#Colab Environment Check for GPU and RAM
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#GPU check
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 13.6 gigabytes of available RAM

Sun Feb  6 19:08:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

Make sure your version of Transformers is at least 4.11.0, since some of the functionality used in this notebook was introduced in that version:

In [None]:
# Import the Hugging Face libraries and check the installed Transformers
# version: at least 4.11.0 is needed to run the following code correctly.
import datasets
import transformers

MIN_TRANSFORMERS_VERSION = (4, 11)

print(transformers.__version__)

# Compare only the numeric major/minor components so that suffixes such as
# ".dev0" or "rc1" on the patch component do not break the parsing.
_installed_version = tuple(int(part) for part in transformers.__version__.split(".")[:2])
if _installed_version < MIN_TRANSFORMERS_VERSION:
    # Fail fast instead of hitting an obscure error further down the notebook
    raise ImportError(
        "This notebook needs transformers >= 4.11.0, "
        f"but version {transformers.__version__} is installed."
    )

4.16.2


In [None]:
# Hugging Face auto classes plus the seq2seq collator / trainer utilities
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Hub identifier of the pretrained checkpoint that gets fine-tuned
model_checkpoint = "facebook/bart-base"

In [None]:
# Import Wandb and configure its credentials.
# SECURITY: an API key used to be hardcoded in this cell. A key that has been
# committed to a notebook must be treated as leaked and revoked in the W&B
# settings. The key is now taken from the WANDB_API_KEY environment variable
# when it is already set, and otherwise asked for interactively (no echo).
import os
from getpass import getpass

import wandb

if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

## Loading the dataset

In [None]:
# Load the XSum dataset. The first call downloads and prepares it;
# later calls reuse the prepared copy from the local Hugging Face cache.
raw_datasets = datasets.load_dataset("xsum")

Downloading:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/954 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the available splits with their columns and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

## Preprocessing the data

In [None]:
# Load the tokenizer that belongs to the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Tokenize documents (model inputs) and summaries (labels)

# Truncation limits, in tokens, for the source documents and the target summaries
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: mapping with a "document" entry (source texts) and a
            "summary" entry (target texts).

    Returns:
        The tokenizer output for the documents, truncated to
        ``max_input_length``, with the token ids of the summaries (truncated
        to ``max_target_length``) stored under the "labels" key.
    """
    documents = list(examples["document"])
    model_inputs = tokenizer(documents, max_length=max_input_length, truncation=True)

    # The summaries are encoded inside the tokenizer's target-tokenizer context
    with tokenizer.as_target_tokenizer():
        summary_encoding = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )

    model_inputs["labels"] = summary_encoding["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; with batched=True preprocess_function receives batches of examples
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/205 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
# Display the splits again to confirm the columns added by the tokenization step
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11334
    })
})

## Fine-tuning the model

In [None]:
# Load the pretrained sequence-to-sequence model from the model checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

In [None]:
# Data collator: pads the inputs and labels of each batch while batching,
# instead of padding the whole dataset up front, to save space.
# The collator is built from the tokenizer and is also handed the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Start a Weights & Biases run in the "BART-sum" project to keep track of the training
wandb.init(project="BART-sum")

[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


Next, define the training arguments, followed by a `compute_metrics` function that the `Seq2SeqTrainer` will use to compute the metrics from the predictions; it also does a bit of pre-processing to decode the predictions into texts:

In [None]:
# Define training args, batch size and number of epochs
# batch size max 8 for input length 1024 on Colab Pro

batch_size = 4
epochs = 1
# Last path component of the checkpoint id, used to name the output directory
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    # `load_best_model_at_end` is a boolean flag; the metric that defines
    # "best" is selected separately through `metric_for_best_model`.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=2500,
    # Checkpoints are saved on the same schedule as the evaluations
    save_steps=5000,
    eval_steps=5000,
    save_total_limit=3,
    num_train_epochs=epochs,
    # Generate summaries during evaluation so that ROUGE can be computed
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)

In [None]:
# nltk provides the sentence splitting used in compute_metrics
import nltk
import numpy as np
# Download the 'punkt' tokenizer data
nltk.download('punkt')

# ROUGE metric object used to score generated summaries against references
metric = datasets.load_metric("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is taken as the generated token ids.

    Returns:
        dict mapping every key returned by the ROUGE metric to its mid
        F-measure scaled to 0-100, plus "gen_len" (mean number of non-padding
        tokens per prediction); all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a placeholder, not a token id, and cannot be decoded: replace it
    # with the padding id in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep the mid F-measure of every score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding positions are not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Build the trainer from the model, the arguments and the datasets

train_dataset = tokenized_datasets["train"]
# Evaluate on the first 5666 validation examples only (half of the 11332-row split)
eval_dataset = tokenized_datasets["validation"].select(range(5666))

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


We can now finetune our model by just calling the `train` method:

In [None]:
# Run the fine-tuning; evaluation and checkpointing follow the schedule set in `args`
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: id, document, summary.
***** Running training *****
  Num examples = 204045
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 51012
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
5000,2.3974,2.105726,33.7969,12.8701,27.2922,27.274,19.503
10000,2.3078,2.052178,34.3585,13.2561,27.913,27.9134,19.6929
15000,2.2408,2.007942,34.5408,13.6382,28.3129,28.3147,19.5042
20000,2.1857,1.967978,35.2728,14.3458,28.9145,28.9102,19.5814
25000,2.1376,1.93012,36.151,15.0648,29.7514,29.7511,19.395
30000,2.1096,1.904812,36.5037,15.5105,30.0775,30.0771,19.5478
35000,2.056,1.876446,36.8526,15.721,30.2841,30.2778,19.5143
40000,2.0377,1.845096,37.1181,15.9993,30.6514,30.6443,19.5157
45000,2.0047,1.824728,37.4224,16.2333,30.9516,30.9443,19.4905
50000,1.9908,1.808068,37.727,16.4899,31.1554,31.1408,19.5307


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: id, document, summary.
***** Running Evaluation *****
  Num examples = 5666
  Batch size = 4
Saving model checkpoint to bart-base-finetuned-xsum/checkpoint-5000
Configuration saved in bart-base-finetuned-xsum/checkpoint-5000/config.json
Model weights saved in bart-base-finetuned-xsum/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in bart-base-finetuned-xsum/checkpoint-5000/tokenizer_config.json
Special tokens file saved in bart-base-finetuned-xsum/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: id, document, summary.
***** Running Evaluation *****
  Num examples = 5666
  Batch size = 4
Saving model checkpoint to bart-base-finetuned-xsum/checkpoint-10000
Configuration saved in bart-base-finetu

TrainOutput(global_step=51012, training_loss=2.1599176275695546, metrics={'train_runtime': 21857.5468, 'train_samples_per_second': 9.335, 'train_steps_per_second': 2.334, 'total_flos': 9.41651411710464e+16, 'train_loss': 2.1599176275695546, 'epoch': 1.0})

In [None]:
# End the current Weights & Biases run
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▄█▄▅▁▅▄▄▃▄
eval/loss,█▇▆▅▄▃▃▂▁▁
eval/rouge1,▁▂▂▄▅▆▆▇▇█
eval/rouge2,▁▂▂▄▅▆▇▇██
eval/rougeL,▁▂▃▄▅▆▆▇██
eval/rougeLsum,▁▂▃▄▅▆▆▇██
eval/runtime,▂▃▃▁▃▂▄█▆▇
eval/samples_per_second,▇▆▆█▆▇▅▁▃▂
eval/steps_per_second,▇▆▆█▆▇▅▁▃▂
train/epoch,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇███

0,1
eval/gen_len,19.5307
eval/loss,1.80807
eval/rouge1,37.727
eval/rouge2,16.4899
eval/rougeL,31.1554
eval/rougeLsum,31.1408
eval/runtime,729.9036
eval/samples_per_second,7.763
eval/steps_per_second,1.941
train/epoch,1.0


In [None]:
# List the working directory with human-readable sizes
!ls -lh

total 1.5G
drwxr-xr-x 5 root root 4.0K Feb  7 01:15 bart-base-finetuned-xsum
-rw-r--r-- 1 root root 1.5G Feb  7 01:26 bart-base-finetuned-xsum.zip
drwx------ 5 root root 4.0K Feb  7 01:27 drive
drwxr-xr-x 1 root root 4.0K Feb  1 14:32 sample_data
drwxr-xr-x 3 root root 4.0K Feb  6 19:15 wandb


In [None]:
# Archive the step-50000 checkpoint directory recursively into a zip file under /content.
# NOTE(review): the checkpoint number is hardcoded, and the whole directory is
# archived, so optimizer/scheduler state files are included next to the weights.
!zip -r /content/bart-base-finetuned-xsum.zip /content/bart-base-finetuned-xsum/checkpoint-50000/

  adding: content/bart-base-finetuned-xsum/checkpoint-50000/ (stored 0%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/training_args.bin (deflated 49%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/pytorch_model.bin (deflated 8%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/tokenizer.json (deflated 72%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/tokenizer_config.json (deflated 45%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/rng_state.pth (deflated 27%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/special_tokens_map.json (deflated 50%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/optimizer.pt (deflated 8%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/scaler.pt (deflated 55%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/scheduler.pt (deflated 49%)
  adding: content/bart-base-finetuned-xsum/checkpoint-50000/config.json (deflated 63%)
  adding: content/ba

In [None]:
# Mount Google Drive at /content/drive so that files can be copied to it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp bart-base-finetuned-xsum.zip '/content/drive/My Drive/weights/'

## Trying with a smaller dataset

In [None]:
# Keep only the first 5000 training and the first 500 validation examples
small_train = raw_datasets['train'].select(range(5000))
small_val = raw_datasets['validation'].select(range(500))
small_train

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 5000
})

In [None]:
# Tokenize the reduced splits with the same preprocessing function as the full dataset
tokenized_train = small_train.map(preprocess_function, batched=True)
tokenized_val = small_val.map(preprocess_function, batched=True)
# Display the tokenized training subset
tokenized_train

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'document', 'id', 'input_ids', 'labels', 'summary'],
    num_rows: 5000
})

In [None]:
# Load a separate copy of the pretrained checkpoint (its files are already in
# the local cache), so this experiment does not reuse the fine-tuned `model`
model_small = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/facebook/bart-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f5310d276a6d1648d00c32fadc8bf7b4607e0fbd5b404fc4a0045960aa2bdfdb.da0f3c0e2dc1c2fecc46738a1ebf4806f2fc36aae3d5c1947f21e063e7cab34b
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false

In [None]:
# Data collator: pads the inputs and labels of each batch to save space.
# NOTE(review): this rebinds `data_collator`, which was created earlier for
# `model`; re-running the first trainer cell after this one would pick up the
# collator built for `model_small`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_small)

In [None]:
# Training arguments for the small-dataset experiment
# batch size max 16 on Colab Pro

batch_size = 6
epochs = 40
# Last path component of the checkpoint id, used to name the output directory
model_name = model_checkpoint.rsplit("/", 1)[-1]
args_small = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-xsum-small",
    # Schedule: evaluate and save a checkpoint once per epoch
    num_train_epochs=epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation settings
    learning_rate=4e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # Generate summaries during evaluation so that ROUGE can be computed
    predict_with_generate=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Build the trainer for the small-dataset experiment

train_dataset = tokenized_train
eval_dataset = tokenized_val

trainer_small = Seq2SeqTrainer(
    model=model_small,
    args=args_small,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Train on the small subset; evaluation and checkpointing follow `args_small`
trainer_small.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running training *****
  Num examples = 5000
  Num Epochs = 40
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 33360
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.5231,2.151397,32.3181,12.0723,26.3401,26.362,19.526
2,2.0232,2.188214,32.9806,12.7284,26.7077,26.7012,19.748
3,1.7118,2.232348,33.3302,12.3525,26.8634,26.8674,19.492
4,1.3852,2.326119,33.1956,12.2523,26.479,26.5324,19.806
5,1.1676,2.458862,33.7582,12.4151,27.2442,27.2582,19.648
6,0.9873,2.547485,32.5162,11.7455,26.0007,26.0014,19.728
7,0.7849,2.675,32.9153,12.1529,26.2095,26.2179,19.69
8,0.6511,2.756314,31.9502,11.5403,25.7578,25.7416,19.784
9,0.5612,2.836637,32.068,11.816,25.4903,25.4607,19.63
10,0.4317,2.931043,31.9349,11.0571,25.6695,25.644,19.88


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 6
Saving model checkpoint to bart-base-finetuned-xsum-small/checkpoint-834
Configuration saved in bart-base-finetuned-xsum-small/checkpoint-834/config.json
Model weights saved in bart-base-finetuned-xsum-small/checkpoint-834/pytorch_model.bin
tokenizer config file saved in bart-base-finetuned-xsum-small/checkpoint-834/tokenizer_config.json
Special tokens file saved in bart-base-finetuned-xsum-small/checkpoint-834/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: summary, id, document.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 6
Saving model checkpoint to bart-base-finetuned-xsum-small/checkpoint-1668
Configurati

KeyboardInterrupt: ignored

In [None]:
# End the Weights & Biases run of the small-dataset experiment
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▂▅▁▆▃▅▄▆▃▇█▆▇▅
eval/loss,▁▁▂▂▃▄▅▅▆▆▇▇██
eval/rouge1,▄▆▇▆█▄▆▃▃▃▃▃▁▄
eval/rouge2,▆█▇▇▇▅▆▄▅▃▄▃▁▄
eval/rougeL,▅▆▇▅█▄▄▃▂▂▂▁▁▄
eval/rougeLsum,▅▆▇▅█▄▄▃▂▂▂▁▁▄
eval/runtime,▅▆▆▇▄▄▅▅▄▆█▁▂▂
eval/samples_per_second,▄▃▃▂▅▄▄▄▅▃▁█▇▇
eval/steps_per_second,▄▃▃▂▅▅▄▅▅▃▁█▇▇
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
eval/gen_len,19.72
eval/loss,3.18722
eval/rouge1,32.3288
eval/rouge2,11.3217
eval/rougeL,26.0661
eval/rougeLsum,26.0653
eval/runtime,47.2056
eval/samples_per_second,10.592
eval/steps_per_second,1.779
train/epoch,14.39
