In [1]:
import os, sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset

import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer

In [None]:
# Choose the compute device in priority order: CUDA, then Apple MPS, then CPU.
# The MPS probe is only evaluated when CUDA is not available.
cuda_ready = torch.cuda.is_available()
mps_ready = (not cuda_ready) and torch.backends.mps.is_available()

if cuda_ready:
    device = torch.device("cuda")
    print("using cuda")
elif mps_ready:
    # This is tracked as pytorch issue #98222
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1"
    device = torch.device("mps")
    print("using mps")
else:
    device = torch.device("cpu")
    print("using cpu")


using cuda


### Load datasets

In [None]:
# Load the IndoSum splits through the local SEACrowd loader script.
# Alternative loader call, left disabled:
# ds = load_dataset('maryantocinn/indosum')
ds = load_dataset("./repo/SEACrowd/indosum/indosum.py")

# Preview the first five training rows as a pandas table.
train_preview = pd.DataFrame(ds["train"][:5])
train_preview.head()

Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


In [None]:
# Report how many examples each split contains.
for caption, split_name in (
    ("Train dataset length: ", "train"),
    ("Validation dataset length: ", "validation"),
    ("Test dataset length: ", "test"),
):
    print(caption, len(ds[split_name]))

Train dataset length:  14262
Validation dataset length:  750
Test dataset length:  3762


### Load Model

In [None]:
# Model weights and tokenizer are both loaded from the same checkpoint name.
checkpoint_name = "indobenchmark/indobart-v2"

# Explicit-class variant, left disabled:
# bart_model = MBartForConditionalGeneration.from_pretrained('indobenchmark/indobart-v2')

# `model` / `tokenizer` are the names used by the rest of the notebook;
# `bart_model` / `indonlg_tokenizer` are kept as aliases of the same objects.
model = bart_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
tokenizer = indonlg_tokenizer = IndoNLGTokenizer.from_pretrained(checkpoint_name)

### Train Model

In [None]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with a "document" entry (source texts) and a
            "summary" entry (target texts).

    Returns:
        The tokenizer output for the documents (truncated to 768 tokens) with
        an extra "labels" entry holding the input ids of the summaries
        (truncated to 128 tokens).
    """
    encoded = tokenizer(examples["document"], truncation=True, max_length=768)

    # Summaries go through the target side of the tokenizer; only their ids are kept.
    target_ids = tokenizer(
        text_target=examples["summary"], truncation=True, max_length=128
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded


# Setup evaluation
# Quietly fetch the NLTK "punkt_tab" resource — presumably what the
# nltk.sent_tokenize calls in compute_metrics rely on; confirm for the
# installed NLTK version.
nltk.download("punkt_tab", quiet=True)
# ROUGE metric object; compute_metrics calls metric.compute(...) on it.
metric = evaluate.load("rouge")


def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may
            also be a tuple whose first element holds the generated ids.
            Positions equal to -100 in either array are treated as padding.

    Returns:
        The result of ``metric.compute`` for the decoded predictions and
        references (called with ``use_stemmer=True``).
    """
    preds, labels = eval_preds

    # Some models hand back a tuple of outputs; the generated ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id, so it must be swapped for the pad id before
    # decoding. Labels use it for ignored positions, and predictions can carry
    # it as well when batches are padded to a common length while gathering.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    # NOTE(review): sent_tokenize and use_stemmer run with their defaults on
    # Indonesian text — confirm this is the intended scoring setup.
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    return metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )


# Tokenize every split; batched=True passes preprocess_function whole batches.
tokenized_ds = ds.map(preprocess_function, batched=True)

# Batch collator handed to the trainer, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

per_device_batch_size = 8  # 8 for low hardware spec
output_dir = "./results/00-indobart"

# Training configuration: evaluate after every epoch, checkpoint every 1000
# steps (keeping the two most recent), and generate summaries during evaluation
# so compute_metrics receives generated token ids.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5,  # hf example: 2e-5
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=3,  # hf example: 2
    # Request fp16 mixed precision only when CUDA is present, so this cell also
    # runs on the mps (Apple silicon) and cpu fallbacks selected earlier.
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
    generation_max_length=80,
    log_level="info",
    logging_first_step=True,
    # `resume_from_checkpoint` is intentionally not set here: the Trainer does
    # not read it from the arguments object. To resume a run, pass
    # `resume_from_checkpoint=...` to `trainer.train()` instead.
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    # `processing_class` is used instead of `tokenizer=`, which raises a
    # FutureWarning (deprecated, to be removed in version 5.0.0).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using auto half precision backend


In [7]:
# Run the fine-tuning loop configured above. The returned TrainOutput (shown in
# the cell output) reports the global step count, training loss and run metrics.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5547,0.518261,0.693098,0.615,0.659614,0.684784
2,0.4225,0.501937,0.686966,0.60792,0.654286,0.678254
3,0.3343,0.509772,0.687822,0.608429,0.6548,0.679702


Saving model checkpoint to ./results/00-indobart/checkpoint-1000
Configuration saved in ./results/00-indobart/checkpoint-1000/config.json
Configuration saved in ./results/00-indobart/checkpoint-1000/generation_config.json
Model weights saved in ./results/00-indobart/checkpoint-1000/model.safetensors
tokenizer config file saved in ./results/00-indobart/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/00-indobart/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-indobart/checkpoint-2000
Configuration saved in ./results/00-indobart/checkpoint-2000/config.json
Configuration sav

TrainOutput(global_step=5349, training_loss=0.4577951965030721, metrics={'train_runtime': 1317.9973, 'train_samples_per_second': 32.463, 'train_steps_per_second': 4.058, 'total_flos': 1.5555268214194176e+16, 'train_loss': 0.4577951965030721, 'epoch': 3.0})

### Predict Test Data and evaluate the score

In [None]:
# Run generation over the tokenized test split.
test_predictions = trainer.predict(tokenized_ds["test"])

# Pull out the generated ids and the reference ids, then score them with the
# same compute_metrics function that is used during training-time evaluation.
preds, labels = test_predictions.predictions, test_predictions.label_ids
rouge_scores = compute_metrics((preds, labels))

# Show the resulting ROUGE scores.
print("ROUGE scores on the test set:", rouge_scores)

The following columns in the test set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: id, document, summary. If id, document, summary are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 3762
  Batch size = 8


ROUGE scores on the test set: {'rouge1': np.float64(0.6808638082877687), 'rouge2': np.float64(0.6014193813641668), 'rougeL': np.float64(0.6458487620938582), 'rougeLsum': np.float64(0.6719528217041734)}


### Pipeline summary

In [9]:
# Summarization pipeline that reuses the fine-tuned model and tokenizer on the
# device selected at the top of the notebook.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# Number of test articles to summarize, capped at the size of the test split so
# a smaller split cannot be over-indexed.
n_samples = min(100, len(ds['test']))

# Collect one record per article and build the table once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all rows on every iteration.
records = []
for i in range(n_samples):
    example = ds['test'][i]  # fetch the row once, then read both fields from it
    generated = summarizer(example['document'], min_length=5, max_length=80)
    records.append(
        {
            'document': example['document'],
            'summary': example['summary'],
            'generated_summary': generated[0]['summary_text'],
        }
    )

# Result table: document, summary, generated_summary
df = pd.DataFrame(records, columns=['document', 'summary', 'generated_summary'])

# Specify the directory and file path
directory = 'benc_result/00-indobart/'

# Create the directory if it doesn't exist
os.makedirs(directory, exist_ok=True)

# Save the DataFrame as CSV and JSON; os.path.join avoids a doubled separator
# because `directory` already ends with a slash.
df.to_csv(os.path.join(directory, 'summarization_result.csv'))
df.to_json(os.path.join(directory, 'summarization_result.json'))

df.head()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,document,summary,generated_summary
0,"Jakarta, CNN Indonesia - - Dilansir AFP, seora...",Eman Ahmed Abd El Aty memiliki berat badan men...,seorang warga mesir yang dipercaya sebagai wa...
1,Menteri Pertahanan Ryamizard Ryacudu menyambut...,Menteri Pertahanan Ryamizard Ryacudu menyambut...,menteri pertahanan ryamizard ryacudu menyambu...
2,"Jakarta, CNN Indonesia - - Meski sudah hampir ...",Rumah produksi film yang dibintangi Lindsay Lo...,"meski sudah hampir 12 tahun berlalu, film mea..."
3,"Usai melaksanakan ibadah haji, Eggi Sudjana ak...",Eggi Sudjana akhirnya mendatangi kantor Baresk...,"setelah melaksanakan ibadah haji, eggi sudjan..."
4,Banyak cara untuk memberikan pengajaran kepada...,Game permainan Kartu Muslim. Menggunakan basis...,terdapat cara untuk memberikan pengajaran kep...
