In [None]:
import os, sys

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer

In [None]:
# Pick the compute device: prefer CUDA, then Apple MPS, and fall back to CPU.
cuda_ready = torch.cuda.is_available()
# MPS is only probed when CUDA is not available.
mps_ready = (not cuda_ready) and torch.backends.mps.is_available()

if cuda_ready:
    device = torch.device('cuda')
    print('using cuda')
elif mps_ready:
    device = torch.device('mps')
    # This is tracked as pytorch issue #98222
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1"
    print('using mps')
else:
    device = torch.device('cpu')
    print('using cpu')


### Load datasets

In [None]:
# Load IndoSum through the local SEACrowd loader script.
# (Alternative dataset id used previously: load_dataset('maryantocinn/indosum'))
ds = load_dataset('./repo/SEACrowd/indosum/indosum.py')

# Preview the first 5 training rows as a pandas table.
preview_rows = ds['train'][:5]
pd.DataFrame(preview_rows).head()

In [None]:
# Report how many examples each split contains.
train_size, validation_size, test_size = (
    len(ds[split]) for split in ('train', 'validation', 'test')
)
print("Train dataset length: ", train_size)
print("Validation dataset length: ", validation_size)
print("Test dataset length: ", test_size)

### Load Model

In [None]:
# Both the seq2seq model and its tokenizer come from the same pretrained checkpoint.
checkpoint_name = 'indobenchmark/indobart-v2'

bart_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
indonlg_tokenizer = IndoNLGTokenizer.from_pretrained(checkpoint_name)

# Generic aliases used by the rest of the notebook.
model, tokenizer = bart_model, indonlg_tokenizer

### Train Model

In [None]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    """Tokenize a batch of documents and attach tokenized summaries as labels.

    Args:
        examples: Mapping with a "document" entry (model inputs) and a
            "summary" entry (targets), as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the documents (``max_length=768``, truncated),
        extended with a "labels" entry holding the ``input_ids`` of the
        tokenized summaries (``max_length=128``, truncated).
    """
    encoded = tokenizer(
        examples["document"],
        truncation=True,
        max_length=768,
    )
    target_encoding = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=128,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# Evaluation setup: the ROUGE metric object, plus the NLTK "punkt_tab" resource
# fetched quietly for the sentence splitting done in compute_metrics.
metric = evaluate.load("rouge")
nltk.download("punkt_tab", quiet=True)

def compute_metrics(eval_preds):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated token ids.

    Returns:
        The result of ``metric.compute`` for the decoded predictions and
        references.
    """
    preds, labels = eval_preds

    # Some models return extra outputs alongside the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is used as a padding/ignore value and cannot be decoded; swap it for
    # the pad token in BOTH arrays (predictions are padded with -100 when
    # batches of different generated lengths are concatenated).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # NOTE(review): use_stemmer presumably applies an English stemmer; confirm
    # it is intended for Indonesian text before comparing scores across papers.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

# Tokenize every split in batches.
tokenized_ds = ds.map(
    preprocess_function,
    batched=True,
)

# Collator that batches the tokenized seq2seq examples for the trainer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

# 8 for low hardware spec
per_device_batch_size = 8

# Training configuration: evaluate once per epoch, generating summaries
# (predict_with_generate) so ROUGE can be computed on decoded text.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results/00-indobart",
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5, # hf example: 2e-5
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=3, # hf example: 2
    # fp16 mixed precision is only enabled on CUDA; it is not supported on
    # mps (Apple silicon) or cpu, which this notebook can also select.
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,
    generation_max_length=80,
    log_level="info",
    logging_first_step=True,
)

# Wire the model, data, collator and metric function into the trainer.
# `processing_class` is used instead of `tokenizer`, which triggers a
# FutureWarning (deprecated, removed in version 5.0.0 for `Seq2SeqTrainer.__init__`).
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)

In [None]:
# Resume from the newest checkpoint in the output directory when one exists;
# otherwise train from scratch. Passing resume_from_checkpoint=True raises on a
# fresh run because there is no checkpoint (or no output directory) yet.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_checkpoint is None:
    print("No checkpoint found in", output_dir, "- training from scratch")
else:
    print("Resuming training from", last_checkpoint)

trainer.train(resume_from_checkpoint=last_checkpoint)

### Predict Test Data and evaluate the score

In [None]:
# Run generation over the tokenized test split.
test_predictions = trainer.predict(tokenized_ds['test'])

# Unpack the generated ids and the reference ids from the prediction output.
preds, labels = test_predictions.predictions, test_predictions.label_ids

# Score them with the same function used during training-time evaluation.
rouge_scores = compute_metrics((preds, labels))

# Report the ROUGE scores.
print("ROUGE scores on the test set:", rouge_scores)

### Pipeline summary

In [None]:
# Build a summarization pipeline around the fine-tuned model on the selected device.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# Summarize up to the first 100 test examples (fewer if the split is smaller).
num_samples = min(100, len(ds['test']))

# Collect rows in a list and build the DataFrame once; growing a DataFrame with
# pd.concat inside the loop is quadratic.
records = []
for i in range(num_samples):
    example = ds['test'][i]  # fetch the row once per iteration
    generated = summarizer(example['document'], min_length=5, max_length=80)
    records.append({
        'document': example['document'],
        'summary': example['summary'],
        'generated_summary': generated[0]['summary_text'],
    })

# Table of results: document, summary, generated_summary
df = pd.DataFrame(records, columns=['document', 'summary', 'generated_summary'])

# Specify the directory and file path
directory = 'benc_result/00-indobart/'

# Create the directory if it doesn't exist
os.makedirs(directory, exist_ok=True)

# Save the DataFrame as CSV and JSON
df.to_csv(os.path.join(directory, 'summarization_result.csv'))
df.to_json(os.path.join(directory, 'summarization_result.json'))

df.head()