In [1]:
%load_ext autoreload
%autoreload 2

import os
import json

from src.datasets import IndoSum
from src.common import get_device
from src.indobart.base import get_model, get_tokenizer

import numpy as np
import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


from accelerate import Accelerator

In [2]:
accelerator = Accelerator()
device = accelerator.device
device

device(type='cuda')

### Data Loading

In [3]:
indosum = IndoSum()
indosum.ds

DatasetDict({
    train: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 14262
    })
    test: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 3762
    })
    validation: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 750
    })
})

In [4]:
indosum.to_pd("train").head()

Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


### Load Model

In [5]:
model = get_model()
tokenizer = get_tokenizer()

In [6]:
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(40004, 768, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(40004, 768, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [7]:
tokenizer

IndoNLGTokenizer(name_or_path='indobenchmark/indobart-v2', vocab_size=40004, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<mask>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40003: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

### Train Model

In [8]:
# Setup evaluation
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

#### Preparation

In [10]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["document"], max_length=768, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result

tokenized_ds = indosum.ds.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

def train_model(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length):
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir + "/checkpoint",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        weight_decay=0.01,
        num_train_epochs=num_train_epochs,
        fp16=True,
        predict_with_generate=True,
        generation_max_length=generation_max_length,
        log_level="info",
        logging_first_step=True,
        logging_dir=output_dir + "/logs",
        resume_from_checkpoint=True,
        save_total_limit=1,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return trainer
    
def evaluate_model(trainer):
    eval_results = trainer.evaluate(eval_dataset=tokenized_ds["test"])
    return eval_results


def train_and_evaluate(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length):
    trainer = train_model(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length)
    eval_results = evaluate_model(trainer)
    
    return trainer, eval_results


#### Training & Evaluation

Try multiple generation max length with the rest parameters fixed.
Observes the best score and the corresponding generation max length.

In [11]:
experiments = []

for i in range(1, 6):
    generation_max_length = 50 + i * 10
    experiments.append({
        "output_dir": f"./results/00-indobart/0{i}",
        "per_device_batch_size": 8,
        "learning_rate": 3.75e-5,
        "num_train_epochs": 3,
        "generation_max_length": generation_max_length
    })

for exp in experiments:
    os.makedirs(exp["output_dir"], exist_ok=True)
    
    trainer, eval_results = train_and_evaluate(
        exp["output_dir"],
        exp["per_device_batch_size"],
        exp["learning_rate"],
        exp["num_train_epochs"],
        exp["generation_max_length"]
    )
    
    # print params and the results
    print("=== Results for experiment ===")
    print("-- Params --") 
    print(json.dumps(exp, indent=4))
    print("-- Eval results --")
    print(json.dumps(eval_results, indent=4))
    
    # save mapping between params and results
    with open(exp["output_dir"] + "/params.json", "w") as f:
        json.dump(exp, f)
    
    with open(exp["output_dir"] + "/eval_results.json", "w") as f:
        json.dump(eval_results, f)



Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.554,0.517173,0.667031,0.597789,0.639028,0.659747
2,0.4231,0.502262,0.668084,0.597467,0.640838,0.660474
3,0.3339,0.509274,0.667058,0.596807,0.640493,0.659551


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

Saving model checkpoint to ./results/00-indobart/01/checkpoint/checkpoint-1783
Configuration saved in ./results/00-indobart/01/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-indobart/01/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-indobart/01/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-indob

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-indobart/01",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 60
}
-- Eval results --
{
    "eval_loss": 0.5376449823379517,
    "eval_rouge1": 0.6578535232670708,
    "eval_rouge2": 0.5860617703657387,
    "eval_rougeL": 0.6272031590547716,
    "eval_rougeLsum": 0.6492784059277354,
    "eval_runtime": 827.9129,
    "eval_samples_per_second": 4.544,
    "eval_steps_per_second": 0.569,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.279,0.556184,0.683701,0.606891,0.652541,0.675584
2,0.2204,0.556206,0.69068,0.613901,0.660262,0.682306
3,0.1893,0.56692,0.691895,0.61609,0.66177,0.683992


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-indobart/02/checkpoint/checkpoint-1783
Configuration saved in ./results/00-indobart/02/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-indobart/02/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-indobart/02/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-indobart/02/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-indobart/02/checkpoint/checkpoint-1783/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-indobart/02",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 70
}
-- Eval results --
{
    "eval_loss": 0.5976317524909973,
    "eval_rouge1": 0.682377584319096,
    "eval_rouge2": 0.6055809609561018,
    "eval_rougeL": 0.6490064606746873,
    "eval_rougeLsum": 0.6728720102402448,
    "eval_runtime": 899.077,
    "eval_samples_per_second": 4.184,
    "eval_steps_per_second": 0.524,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.1133,0.638636,0.6815,0.6006,0.646978,0.673668
2,0.0989,0.644547,0.689555,0.610778,0.656815,0.680999
3,0.1049,0.639322,0.683673,0.603201,0.650882,0.675437


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-indobart/03/checkpoint/checkpoint-1783
Configuration saved in ./results/00-indobart/03/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-indobart/03/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-indobart/03/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-indobart/03/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-indobart/03/checkpoint/checkpoint-1783/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-indobart/03",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 80
}
-- Eval results --
{
    "eval_loss": 0.6712408065795898,
    "eval_rouge1": 0.6813039421192061,
    "eval_rouge2": 0.600212172181025,
    "eval_rougeL": 0.6452167875185628,
    "eval_rougeLsum": 0.6714865512581554,
    "eval_runtime": 1106.0189,
    "eval_samples_per_second": 3.401,
    "eval_steps_per_second": 0.426,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0449,0.734305,0.664274,0.581619,0.628552,0.655224
2,0.0447,0.735515,0.675261,0.594191,0.640338,0.667221
3,0.0631,0.699859,0.673045,0.5909,0.637685,0.664565


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-indobart/04/checkpoint/checkpoint-1783
Configuration saved in ./results/00-indobart/04/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-indobart/04/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-indobart/04/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-indobart/04/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-indobart/04/checkpoint/checkpoint-1783/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-indobart/04",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 90
}
-- Eval results --
{
    "eval_loss": 0.7304822206497192,
    "eval_rouge1": 0.671626285419908,
    "eval_rouge2": 0.5889033857964887,
    "eval_rougeL": 0.6343955226105362,
    "eval_rougeLsum": 0.6622529154483132,
    "eval_runtime": 1259.7242,
    "eval_samples_per_second": 2.986,
    "eval_steps_per_second": 0.374,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0235,0.793746,0.655516,0.573237,0.619078,0.647701
2,0.0247,0.772315,0.658569,0.576196,0.621591,0.649946
3,0.0408,0.744207,0.656982,0.574017,0.620212,0.648578


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-indobart/05/checkpoint/checkpoint-1783
Configuration saved in ./results/00-indobart/05/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-indobart/05/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-indobart/05/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-indobart/05/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-indobart/05/checkpoint/checkpoint-1783/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding 

=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-indobart/05",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 100
}
-- Eval results --
{
    "eval_loss": 0.7771861553192139,
    "eval_rouge1": 0.6505126244879144,
    "eval_rouge2": 0.5673431688996508,
    "eval_rougeL": 0.6124387532072134,
    "eval_rougeLsum": 0.641838960450171,
    "eval_runtime": 1295.8065,
    "eval_samples_per_second": 2.903,
    "eval_steps_per_second": 0.363,
    "epoch": 3.0
}
