In [8]:
import os, sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import MBartForConditionalGeneration
from torch import optim

import nltk
import evaluate
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer

In [13]:
# Pick the compute device: CUDA first, then Apple MPS, and CPU as the fallback.
if torch.cuda.is_available():
    device, banner = torch.device('cuda'), 'using cuda'
elif torch.backends.mps.is_available():
    # MPS memory watermark override; the original note points at pytorch issue #98222.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1"
    device, banner = torch.device('mps'), 'using mps'
else:
    device, banner = torch.device('cpu'), 'using cpu'

# Report which backend was selected.
print(banner)


using cuda


### Load datasets

In [10]:
# Load the dataset through the loader script checked out under ./repo/SEACrowd.
ds = load_dataset('./repo/SEACrowd/indosum/indosum.py')
# Alternative source (disabled): ds = load_dataset('maryantocinn/indosum')

# Per-split aliases are left disabled; later cells index `ds` directly.
# data_train = ds['train']
# data_val = ds['validation']
# data_test = ds['test']

# Preview the first 5 training rows as a pandas table.
pd.DataFrame(ds['train'][:5]).head()


Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,,Jokowi memimpin upacara penurunan bendera. Usa...


In [None]:
# Report how many examples each split of the dataset holds.
for label, split in (
    ("Train dataset length: ", 'train'),
    ("Validation dataset length: ", 'validation'),
    ("Test dataset length: ", 'test'),
):
    print(label, len(ds[split]))

Train dataset length:  14262
Validation dataset length:  750
Test dataset length:  3762


### Load Model

In [11]:
# One checkpoint name is shared by the model weights and the tokenizer.
CHECKPOINT = 'indobenchmark/indobart-v2'

# Alternative loader (disabled): AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
indonlg_tokenizer = IndoNLGTokenizer.from_pretrained(CHECKPOINT)
bart_model = MBartForConditionalGeneration.from_pretrained(CHECKPOINT)

# Generic aliases used by the remaining cells.
tokenizer, model = indonlg_tokenizer, bart_model

### Train Model

In [None]:
# Prepare and tokenize dataset
def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping that provides a "document" entry (model inputs)
            and a "summary" entry (targets).
        max_input_length: Token limit applied to each document; longer
            documents are truncated. Defaults to 1024.
        max_target_length: Token limit applied to each summary; longer
            summaries are truncated. Defaults to 128.

    Returns:
        The tokenizer output for the documents, with an extra "labels" entry
        holding the "input_ids" of the tokenized summaries.
    """
    # Encoder side: the source documents.
    model_inputs = tokenizer(
        examples["document"], max_length=max_input_length, truncation=True
    )

    # Decoder side: summaries are passed as `text_target` so they are encoded as targets.
    labels = tokenizer(
        text_target=examples["summary"], max_length=max_target_length, truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Setup evaluation: download the NLTK "punkt_tab" resource quietly (presumably what
# nltk.sent_tokenize in compute_metrics relies on) and load the ROUGE metric.
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is used.

    Returns:
        The result of ``metric.compute`` on the decoded, sentence-split texts.
    """
    preds, labels = eval_preds

    # Some models return a tuple of outputs; the token ids are expected first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padded/ignored positions and cannot be decoded, so swap it for
    # the pad token id in BOTH arrays (predictions can be padded with -100 too).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # NOTE(review): use_stemmer=True is kept from the original; confirm the stemmer
    # is appropriate for the language of this dataset.
    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

# Tokenize the dataset; batched=True hands preprocess_function batches of examples.
tokenized_ds = ds.map(preprocess_function, batched=True)

# Seq2seq data collator built from the tokenizer and model; it is passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training configuration. Evaluation runs once per epoch and uses generation
# (predict_with_generate) so ROUGE can be computed on decoded text.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results/00-indobart",
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5, # hf example: 2e-5
    per_device_train_batch_size=8, # hf example: 16
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3, # hf example: 2
    # Mixed precision only when CUDA is present: the notebook can also select
    # mps or cpu, where fp16 is not supported (e.g. Apple silicon / mps).
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
    # save_strategy="no",
)

# Assemble the trainer from the model, the training arguments, the tokenized
# train/validation splits, the data collator and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer, # `processing_class` is used because passing `tokenizer=` raises "FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`."
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5547,0.521733,0.337352,0.295579,0.328877,0.333751
2,0.4236,0.503219,0.340289,0.298195,0.332033,0.33667
3,0.3341,0.509233,0.340637,0.29879,0.332058,0.336892




TrainOutput(global_step=5349, training_loss=0.45868174235026044, metrics={'train_runtime': 1217.706, 'train_samples_per_second': 35.137, 'train_steps_per_second': 4.393, 'total_flos': 1.6099778535284736e+16, 'train_loss': 0.45868174235026044, 'epoch': 3.0})

### Predict Test Data and evaluate the score

In [None]:
# Run the trainer's prediction loop over the tokenized test split.
test_predictions = trainer.predict(tokenized_ds['test'])

# Unpack the predicted token ids and the reference label ids.
preds, labels = test_predictions.predictions, test_predictions.label_ids

# Score them with compute_metrics and report the ROUGE values.
rouge_scores = compute_metrics((preds, labels))
print("ROUGE scores on the test set:", rouge_scores)



ROUGE scores on the test set: {'rouge1': np.float64(0.33772576546706445), 'rouge2': np.float64(0.2954139183415241), 'rougeL': np.float64(0.32883016702331247), 'rougeLsum': np.float64(0.33381442413528517)}


{'eval_loss': 0.5360028147697449,
 'eval_rouge1': 0.33772576546706445,
 'eval_rouge2': 0.2954139183415241,
 'eval_rougeL': 0.32883016702331247,
 'eval_rougeLsum': 0.33381442413528517,
 'eval_runtime': 342.2997,
 'eval_samples_per_second': 10.99,
 'eval_steps_per_second': 2.749,
 'epoch': 3.0}

### Pipeline summary

In [None]:
# Wrap the model and tokenizer in a summarization pipeline on the selected device.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# Collect document / reference summary / generated summary for the first 5 test examples.
# NOTE(review): documents are passed to the pipeline without an explicit truncation
# setting, unlike preprocess_function — confirm long inputs are handled.
records = []
for i in range(5):
    example = ds['test'][i]
    document = example['document']
    summary = example['summary']
    generated_summary = summarizer(document, min_length=5, max_length=80)
    records.append({
        'document': document,
        'summary': summary,
        'generated_summary': generated_summary[0]['summary_text'],
    })

# Build the table once from the collected rows instead of concatenating inside the loop.
df = pd.DataFrame(records, columns=['document', 'summary', 'generated_summary'])

# Output directory for the results; create it if it doesn't exist.
directory = 'benc_result/00-indobart/'
os.makedirs(directory, exist_ok=True)

# Save the table as JSON (os.path.join avoids the doubled path separator).
df.to_json(os.path.join(directory, 'summarization_result.json'))

# Last expression of the cell so the notebook actually renders the table.
df.head()