In [1]:
%load_ext autoreload
%autoreload 2

import os
import json

from src.datasets import IndoSum
from src.common import get_device
from src.indobart.base import get_model, get_tokenizer

import numpy as np
import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

from accelerate import Accelerator

from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

In [2]:
accelerator = Accelerator()
device = accelerator.device
device

device(type='cuda')

In [3]:
nltk.download("all", quiet=True)

True

### Data Loading

In [4]:
indosum = IndoSum()
indosum.ds

DatasetDict({
    train: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 14262
    })
    test: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 3762
    })
    validation: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 750
    })
})

In [5]:
indosum.to_pd("train").head()

Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


### Topic Modeling

In [6]:
embedding_model = SentenceTransformer("LazarusNLP/all-indobert-base-v4")

stop_words = (
    stopwords.words("english")
    + stopwords.words("indonesian")
    + StopWordRemoverFactory().get_stop_words()
)
vectorizer_model = CountVectorizer(stop_words=stop_words, token_pattern="[^\W\d_]+")

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    nr_topics=10,
    verbose=True,
)

In [7]:
topics, probs = topic_model.fit_transform(indosum.ds["train"]["document"])

2024-11-12 19:47:19,384 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/446 [00:00<?, ?it/s]

2024-11-12 19:48:09,220 - BERTopic - Embedding - Completed ✓
2024-11-12 19:48:09,221 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-12 19:48:49,385 - BERTopic - Dimensionality - Completed ✓
2024-11-12 19:48:49,388 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-12 19:48:50,145 - BERTopic - Cluster - Completed ✓
2024-11-12 19:48:50,148 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-12 19:48:56,406 - BERTopic - Representation - Completed ✓
2024-11-12 19:48:56,416 - BERTopic - Topic reduction - Reducing number of topics
2024-11-12 19:49:01,762 - BERTopic - Topic reduction - Reduced number of topics from 203 to 10


In [9]:
topic_info = topic_model.get_topic_info()

# save to excel
os.makedirs(f"./results/00-bertopic-indobart", exist_ok=True)
topic_info.to_csv(f"./results/00-bertopic-indobart/topic_info.csv")

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6031,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat..."
1,0,1966,0_gol_pemain_laga_menit,"[gol, pemain, laga, menit, tim, pertandingan, ...",[JUARA.net - Real Madrid hanya butuh tambahan ...
2,1,1954,1_jakarta_kpk_partai_ketua,"[jakarta, kpk, partai, ketua, indonesia, jalan...","[Jakarta, CNN Indonesia - - Komisi Pemberantas..."
3,2,1169,2_pebalap_startup_rossi_teknologi,"[pebalap, startup, rossi, teknologi, berita, i...","[Jakarta, CNN Indonesia - - Pebalap Movistar Y..."
4,3,1092,3_film_lagu_album_konser,"[film, lagu, album, konser, jakarta, orang, in...","[Sulit memang, memilih siapa anggota One Direc..."
5,4,621,4_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, ameri...","[Jakarta, CNN Indonesia - - Presiden Amerika S..."
6,5,550,5_indonesia_persen_startup_rp,"[indonesia, persen, startup, rp, pemerintah, l...","[Jakarta, CNN Indonesia - - Menteri Energi dan..."
7,6,443,6_indonesia_kopi_makanan_festival,"[indonesia, kopi, makanan, festival, wisata, p...",[London (ANTARA News) - Duo desainer Italia Mi...
8,7,423,7_tubuh_makanan_kanker_penyakit,"[tubuh, makanan, kanker, penyakit, orang, pene...","[Selain jantung dan stroke, kanker merupakan p..."
9,8,13,8_minyak_barel_mentah_opec,"[minyak, barel, mentah, opec, harga, produksi,...","[Jakarta, CNN Indonesia - - Harga minyak menta..."


In [None]:
topic_document_info = topic_model.get_document_info(indosum.ds["train"]["document"], indosum.to_pd("train"))

topic_document_info.to_csv(f"./results/00-bertopic-indobart/topic_document_info.csv")

topic_document_info

Unnamed: 0,document,id,summary,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",-1,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat...",indonesia - jakarta - orang - pemain - salah -...,0.000000,False
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...,Selfie ialah salah satu tema terpanas di kalan...,-1,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat...",indonesia - jakarta - orang - pemain - salah -...,0.000000,False
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",-1,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat...",indonesia - jakarta - orang - pemain - salah -...,0.000000,False
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...,Merdeka.com - Indonesia Corruption Watch (ICW)...,-1,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat...",indonesia - jakarta - orang - pemain - salah -...,0.000000,False
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1,1_jakarta_kpk_partai_ketua,"[jakarta, kpk, partai, ketua, indonesia, jalan...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - kpk - partai - ketua - indonesia - j...,0.918330,False
...,...,...,...,...,...,...,...,...,...,...,...
14257,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",1497645345-as-kirimkan-peluncur-rudal-ke-suria...,Amerika Serikat dilaporkan telah mengirimkan s...,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",4,4_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, ameri...","[Jakarta, CNN Indonesia - - Presiden Amerika S...",trump - negara - presiden - israel - orang - a...,0.450864,False
14258,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",1495406700-borneo-bersyukur-tahan-persib-di-gbla,Borneo FC menahan imbang Persib Bandung pada l...,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",-1,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat...",indonesia - jakarta - orang - pemain - salah -...,0.000000,False
14259,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1513941815-mantan-dirjen-perhubungan-laut-sege...,Komisi Pemberantasan Korupsi (KPK) sudah melim...,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1,1_jakarta_kpk_partai_ketua,"[jakarta, kpk, partai, ketua, indonesia, jalan...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - kpk - partai - ketua - indonesia - j...,1.000000,False
14260,Merdeka.com - Sebuah kabar gembira datang bagi...,1496440800-rangking-fifa-indonesia-naik-dua-pe...,Kabar gembira datang bagi sepakbola Indone...,Merdeka.com - Sebuah kabar gembira datang bagi...,-1,-1_indonesia_jakarta_orang_pemain,"[indonesia, jakarta, orang, pemain, salah, neg...","[Jakarta, CNN Indonesia - - Indonesia mencatat...",indonesia - jakarta - orang - pemain - salah -...,0.000000,False


In [None]:
def add_topic(example, idx):
    # if already have <tag>, return the example
    if "<tag>" in example["document"]:
        return example

    curr_topic = " ".join(topic_document_info["Representation"].values[idx])
    example["document"] = f"<tag> {curr_topic} <tag> {example['document']}"
    
    return example

new_ds = indosum.ds
new_ds["train"] = new_ds["train"].map(add_topic, with_indices=True, num_proc=os.cpu_count())

indosum.update(new_ds)

Map (num_proc=8):   0%|          | 0/14262 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 14262
    })
    test: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 3762
    })
    validation: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 750
    })
})

In [12]:
print(json.dumps(indosum.ds["train"][:5], indent=4))

{
    "document": [
        "<tag> indonesia jakarta orang pemain salah negara anak film tim memiliki <tag> Jakarta, CNN Indonesia - - Dokter Ryan Thamrin, yang terkenal lewat acara Dokter Oz Indonesia, meninggal dunia pada Jumat (4 / 8) dini hari. Dokter Lula Kamal yang merupakan selebriti sekaligus rekan kerja Ryan menyebut kawannya itu sudah sakit sejak setahun yang lalu. Lula menuturkan, sakit itu membuat Ryan mesti vakum dari semua kegiatannya, termasuk menjadi pembawa acara Dokter Oz Indonesia. Kondisi itu membuat Ryan harus kembali ke kampung halamannya di Pekanbaru, Riau untuk menjalani istirahat. \" Setahu saya dia orangnya sehat, tapi tahun lalu saya dengar dia sakit. (Karena) sakitnya, ia langsung pulang ke Pekanbaru, jadi kami yang mau jenguk juga susah. Barangkali mau istirahat, ya betul juga, kalau di Jakarta susah isirahatnya, \" kata Lula kepada CNNIndonesia.com, Jumat (4 / 8). Lula yang mengenal Ryan sejak sebelum aktif berkarier di televisi mengaku belum sempat membes

### Load Model

In [13]:
model = get_model()
tokenizer = get_tokenizer()

tokenizer.add_special_tokens({"additional_special_tokens": ["<tag>"]})

1

In [14]:
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(40004, 768, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(40004, 768, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [15]:
tokenizer

IndoNLGTokenizer(name_or_path='indobenchmark/indobart-v2', vocab_size=40004, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<tag>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	39942: AddedToken("<tag>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40003: AddedToken("<m

### Train Model

In [16]:
# Setup evaluation
metric = evaluate.load("rouge")

#### Preparation

In [17]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["document"], max_length=768, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result

tokenized_ds = indosum.ds.map(preprocess_function, batched=True)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

def train_model(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length):
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir + "/checkpoint",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        weight_decay=0.01,
        num_train_epochs=num_train_epochs,
        fp16=True,
        predict_with_generate=True,
        generation_max_length=generation_max_length,
        log_level="info",
        logging_first_step=True,
        logging_dir=output_dir + "/logs",
        resume_from_checkpoint=True,
        save_total_limit=1,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return trainer
    
def evaluate_model(trainer):
    eval_results = trainer.evaluate(eval_dataset=tokenized_ds["test"])
    return eval_results


def train_and_evaluate(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length):
    trainer = train_model(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length)
    eval_results = evaluate_model(trainer)
    
    return trainer, eval_results


Map:   0%|          | 0/14262 [00:00<?, ? examples/s]

Map:   0%|          | 0/3762 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

#### Training & Evaluation

Try multiple generation max length with the rest parameters fixed.
Observes the best score and the corresponding generation max length.

In [18]:
experiments = []

for i in range(1, 6):
    generation_max_length = 50 + i * 10
    experiments.append({
        "output_dir": f"./results/00-bertopic-indobart/0{i}",
        "per_device_batch_size": 8,
        "learning_rate": 3.75e-5,
        "num_train_epochs": 3,
        "generation_max_length": generation_max_length
    })

for exp in experiments:
    os.makedirs(exp["output_dir"], exist_ok=True)
    
    trainer, eval_results = train_and_evaluate(
        exp["output_dir"],
        exp["per_device_batch_size"],
        exp["learning_rate"],
        exp["num_train_epochs"],
        exp["generation_max_length"]
    )
    
    # print params and the results
    print("=== Results for experiment ===")
    print("-- Params --") 
    print(json.dumps(exp, indent=4))
    print("-- Eval results --")
    print(json.dumps(eval_results, indent=4))
    
    # save mapping between params and results
    with open(exp["output_dir"] + "/params.json", "w") as f:
        json.dump(exp, f)
    
    with open(exp["output_dir"] + "/eval_results.json", "w") as f:
        json.dump(eval_results, f)



Using auto half precision backend
The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5467,0.529217,0.654519,0.580659,0.622402,0.645601
2,0.4174,0.519327,0.649801,0.576529,0.617606,0.641114
3,0.3287,0.523548,0.643397,0.568776,0.610196,0.634588


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1
}

Saving model checkpoint to ./results/00-bertopic-indobart/01/checkpoint/checkpoint-1783
Configuration saved in ./results/00-bertopic-indobart/01/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-bertopic-indobart/01/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-bertopic-indobart/01/checkpoint/checkpoint-1783/model.safetensors
tokenizer con

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-bertopic-indobart/01",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 60
}
-- Eval results --
{
    "eval_loss": 0.5529609322547913,
    "eval_rouge1": 0.6364351208505599,
    "eval_rouge2": 0.5605365068088425,
    "eval_rougeL": 0.6020757657731941,
    "eval_rougeLsum": 0.6271883521878868,
    "eval_runtime": 793.0146,
    "eval_samples_per_second": 4.744,
    "eval_steps_per_second": 0.594,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.2753,0.576374,0.67234,0.593267,0.636784,0.663334
2,0.2172,0.578689,0.663294,0.58362,0.626504,0.654402
3,0.186,0.584841,0.665344,0.585009,0.628543,0.656722


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-bertopic-indobart/02/checkpoint/checkpoint-1783
Configuration saved in ./results/00-bertopic-indobart/02/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-bertopic-indobart/02/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-bertopic-indobart/02/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-bertopic-indobart/02/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-bertopic-indobart/02/checkpoint/checkpoint-1783/special_tokens_map.json
The following col

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-bertopic-indobart/02",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 70
}
-- Eval results --
{
    "eval_loss": 0.6192710995674133,
    "eval_rouge1": 0.6510743666734569,
    "eval_rouge2": 0.5681992626386996,
    "eval_rougeL": 0.6118210096565171,
    "eval_rougeLsum": 0.6414583163780941,
    "eval_runtime": 901.8765,
    "eval_samples_per_second": 4.171,
    "eval_steps_per_second": 0.522,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.1099,0.666665,0.670484,0.585737,0.629444,0.660598
2,0.0971,0.661128,0.656109,0.569178,0.61323,0.646373
3,0.1029,0.655045,0.665903,0.580359,0.624929,0.656474


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-bertopic-indobart/03/checkpoint/checkpoint-1783
Configuration saved in ./results/00-bertopic-indobart/03/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-bertopic-indobart/03/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-bertopic-indobart/03/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-bertopic-indobart/03/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-bertopic-indobart/03/checkpoint/checkpoint-1783/special_tokens_map.json
The following col

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-bertopic-indobart/03",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 80
}
-- Eval results --
{
    "eval_loss": 0.6935704946517944,
    "eval_rouge1": 0.6526937238002135,
    "eval_rouge2": 0.5666987399749325,
    "eval_rougeL": 0.610787988799163,
    "eval_rougeLsum": 0.642665223757774,
    "eval_runtime": 1021.7749,
    "eval_samples_per_second": 3.682,
    "eval_steps_per_second": 0.461,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0429,0.750961,0.642532,0.553309,0.596652,0.632901
2,0.0432,0.737981,0.647371,0.559988,0.603214,0.638072
3,0.0631,0.715888,0.643812,0.555343,0.598235,0.634594


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-bertopic-indobart/04/checkpoint/checkpoint-1783
Configuration saved in ./results/00-bertopic-indobart/04/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-bertopic-indobart/04/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-bertopic-indobart/04/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-bertopic-indobart/04/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-bertopic-indobart/04/checkpoint/checkpoint-1783/special_tokens_map.json
The following col

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using auto half precision backend


=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-bertopic-indobart/04",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 90
}
-- Eval results --
{
    "eval_loss": 0.7565310597419739,
    "eval_rouge1": 0.6366294830203212,
    "eval_rouge2": 0.5474059345855151,
    "eval_rougeL": 0.5913407357360366,
    "eval_rougeLsum": 0.6267238925254527,
    "eval_runtime": 1172.786,
    "eval_samples_per_second": 3.208,
    "eval_steps_per_second": 0.402,
    "epoch": 3.0
}


The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0237,0.820262,0.639454,0.551907,0.592439,0.629958
2,0.0241,0.79739,0.640681,0.553928,0.597334,0.631642
3,0.0413,0.763383,0.62898,0.539906,0.582585,0.619641


The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, document, id. If summary, document, id are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-bertopic-indobart/05/checkpoint/checkpoint-1783
Configuration saved in ./results/00-bertopic-indobart/05/checkpoint/checkpoint-1783/config.json
Configuration saved in ./results/00-bertopic-indobart/05/checkpoint/checkpoint-1783/generation_config.json
Model weights saved in ./results/00-bertopic-indobart/05/checkpoint/checkpoint-1783/model.safetensors
tokenizer config file saved in ./results/00-bertopic-indobart/05/checkpoint/checkpoint-1783/tokenizer_config.json
Special tokens file saved in ./results/00-bertopic-indobart/05/checkpoint/checkpoint-1783/special_tokens_map.json
The following col

=== Results for experiment ===
-- Params --
{
    "output_dir": "./results/00-bertopic-indobart/05",
    "per_device_batch_size": 8,
    "learning_rate": 3.75e-05,
    "num_train_epochs": 3,
    "generation_max_length": 100
}
-- Eval results --
{
    "eval_loss": 0.809755802154541,
    "eval_rouge1": 0.6115544682348182,
    "eval_rouge2": 0.5186660587803549,
    "eval_rougeL": 0.5618226831013233,
    "eval_rougeLsum": 0.6011072870063748,
    "eval_runtime": 1339.4461,
    "eval_samples_per_second": 2.809,
    "eval_steps_per_second": 0.352,
    "epoch": 3.0
}
