In [None]:
import os, sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
import json

import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer

In [2]:
# check if cuda or mps available, if available, use one of them, otherwise use cpu

device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("using cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    # os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1" # This is tracked as pytorch issue #98222
    print("using mps")
else:
    device = torch.device("cpu")
    print("using cpu")


using cuda


In [None]:
# Specify the directory and file path

base_dir = "result/01-indobart-bertopic"

benc_dir = f"{base_dir}/benc/"

# Create the benc_dir if it doesn't exist
os.makedirs(benc_dir, exist_ok=True)

### Load datasets

In [15]:
ds = load_dataset("./repo/SEACrowd/indosum/indosum.py")

# check the length of the dataset
print("Train dataset length: ", len(ds["train"]))
print("Validation dataset length: ", len(ds["validation"]))
print("Test dataset length: ", len(ds["test"]))
print("")

# explore the first 5 data in the dataset
print(json.dumps(ds["train"][:5], indent=4))
print("")

Train dataset length:  14262
Validation dataset length:  750
Test dataset length:  3762

{
    "document": [
        "Jakarta, CNN Indonesia - - Dokter Ryan Thamrin, yang terkenal lewat acara Dokter Oz Indonesia, meninggal dunia pada Jumat (4 / 8) dini hari. Dokter Lula Kamal yang merupakan selebriti sekaligus rekan kerja Ryan menyebut kawannya itu sudah sakit sejak setahun yang lalu. Lula menuturkan, sakit itu membuat Ryan mesti vakum dari semua kegiatannya, termasuk menjadi pembawa acara Dokter Oz Indonesia. Kondisi itu membuat Ryan harus kembali ke kampung halamannya di Pekanbaru, Riau untuk menjalani istirahat. \" Setahu saya dia orangnya sehat, tapi tahun lalu saya dengar dia sakit. (Karena) sakitnya, ia langsung pulang ke Pekanbaru, jadi kami yang mau jenguk juga susah. Barangkali mau istirahat, ya betul juga, kalau di Jakarta susah isirahatnya, \" kata Lula kepada CNNIndonesia.com, Jumat (4 / 8). Lula yang mengenal Ryan sejak sebelum aktif berkarier di televisi mengaku belum sem

In [17]:
nltk.download("all", quiet=True)

True

### Topic Modelling

In [None]:
embedding_model = SentenceTransformer("LazarusNLP/all-indobert-base-v4")

stop_words = (
    stopwords.words("indonesian")
    + StopWordRemoverFactory().get_stop_words()
)
vectorizer_model = CountVectorizer(stop_words=stop_words, token_pattern="[^\W\d_]+")

topic_model = BERTopic(
    language="multilingual",
    # embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    nr_topics=10,
    verbose=True,
)

In [8]:
topics, probs = topic_model.fit_transform(ds["train"]["document"])

2024-11-04 09:57:31,630 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/446 [00:00<?, ?it/s]

2024-11-04 09:58:03,091 - BERTopic - Embedding - Completed ✓
2024-11-04 09:58:03,093 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-04 09:58:44,084 - BERTopic - Dimensionality - Completed ✓
2024-11-04 09:58:44,087 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-04 09:58:45,053 - BERTopic - Cluster - Completed ✓
2024-11-04 09:58:45,055 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-04 09:58:51,836 - BERTopic - Representation - Completed ✓
2024-11-04 09:58:51,842 - BERTopic - Topic reduction - Reducing number of topics
2024-11-04 09:58:56,518 - BERTopic - Topic reduction - Reduced number of topics from 206 to 10


In [None]:
topic_info = topic_model.get_topic_info()

# save to excel
topic_info.to_csv(benc_dir + "topic_info.csv")

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4877,-1_indonesia_jakarta_orang_anak,"[indonesia, jakarta, orang, anak, salah, negar...",[Merdeka.com - Calon Gubernur DKI Jakarta no...
1,0,3632,0_jakarta_indonesia_kpk_negara,"[jakarta, indonesia, kpk, negara, presiden, pe...","[Jakarta, CNN Indonesia - - Komisi Pemberantas..."
2,1,2738,1_pemain_gol_menit_laga,"[pemain, gol, menit, laga, tim, musim, liga, p...",[Tragis. Satu kata yang cukup mendeskripsikan ...
3,2,1472,2_film_lagu_indonesia_album,"[film, lagu, indonesia, album, jakarta, orang,...","[Kelihatannya, film horor kembali booming di t..."
4,3,773,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, pengguna...",[Facebook baru saja menggulirkan pembaruan yan...
5,4,462,4_makanan_tubuh_kulit_orang,"[makanan, tubuh, kulit, orang, kopi, kanker, k...",[Siapa yang tak menyukai si manis gula? Gula m...
6,5,183,5_mobil_motor_kendaraan_toyota,"[mobil, motor, kendaraan, toyota, unit, listri...","[Jakarta, CNN Indonesia - - PT Toyota Astra Mo..."
7,6,88,6_harga_persen_minyak_inflasi,"[harga, persen, minyak, inflasi, saham, rp, do...","[Jakarta, CNN Indonesia - - Indeks Harga Saham..."
8,7,26,7_macron_spanyol_catalonia_kemerdekaan,"[macron, spanyol, catalonia, kemerdekaan, pera...","[Jakarta, CNN Indonesia - - Presiden Catalonia..."
9,8,11,8_pizza_topping_gaun_orang,"[pizza, topping, gaun, orang, neapolitan, piza...",[Hampir seluruh orang di dunia pasti pernah me...


In [None]:
topic_document_info = topic_model.get_document_info(ds["train"]["document"], ds["train"].to_pandas())

topic_document_info

Unnamed: 0,document,id,summary,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",2,2_film_lagu_indonesia_album,"[film, lagu, indonesia, album, jakarta, orang,...","[Kelihatannya, film horor kembali booming di t...",film - lagu - indonesia - album - jakarta - or...,1.000000,False
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...,Selfie ialah salah satu tema terpanas di kalan...,-1,-1_indonesia_jakarta_orang_anak,"[indonesia, jakarta, orang, anak, salah, negar...",[Merdeka.com - Calon Gubernur DKI Jakarta no...,indonesia - jakarta - orang - anak - salah - n...,0.000000,False
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1,1_pemain_gol_menit_laga,"[pemain, gol, menit, laga, tim, musim, liga, p...",[Tragis. Satu kata yang cukup mendeskripsikan ...,pemain - gol - menit - laga - tim - musim - li...,0.527291,False
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...,Merdeka.com - Indonesia Corruption Watch (ICW)...,0,0_jakarta_indonesia_kpk_negara,"[jakarta, indonesia, kpk, negara, presiden, pe...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - indonesia - kpk - negara - presiden ...,1.000000,False
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,0,0_jakarta_indonesia_kpk_negara,"[jakarta, indonesia, kpk, negara, presiden, pe...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - indonesia - kpk - negara - presiden ...,0.688217,False
...,...,...,...,...,...,...,...,...,...,...,...
14257,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",1497645345-as-kirimkan-peluncur-rudal-ke-suria...,Amerika Serikat dilaporkan telah mengirimkan s...,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",0,0_jakarta_indonesia_kpk_negara,"[jakarta, indonesia, kpk, negara, presiden, pe...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - indonesia - kpk - negara - presiden ...,1.000000,False
14258,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",1495406700-borneo-bersyukur-tahan-persib-di-gbla,Borneo FC menahan imbang Persib Bandung pada l...,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",1,1_pemain_gol_menit_laga,"[pemain, gol, menit, laga, tim, musim, liga, p...",[Tragis. Satu kata yang cukup mendeskripsikan ...,pemain - gol - menit - laga - tim - musim - li...,1.000000,False
14259,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1513941815-mantan-dirjen-perhubungan-laut-sege...,Komisi Pemberantasan Korupsi (KPK) sudah melim...,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,0,0_jakarta_indonesia_kpk_negara,"[jakarta, indonesia, kpk, negara, presiden, pe...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - indonesia - kpk - negara - presiden ...,0.719686,False
14260,Merdeka.com - Sebuah kabar gembira datang bagi...,1496440800-rangking-fifa-indonesia-naik-dua-pe...,Kabar gembira datang bagi sepakbola Indone...,Merdeka.com - Sebuah kabar gembira datang bagi...,1,1_pemain_gol_menit_laga,"[pemain, gol, menit, laga, tim, musim, liga, p...",[Tragis. Satu kata yang cukup mendeskripsikan ...,pemain - gol - menit - laga - tim - musim - li...,1.000000,False


In [None]:
def add_topic(example, idx):
    # if already have <tag>, return the example
    if "<tag>" in example["document"]:
        return example

    curr_topic = " ".join(topic_document_info["Representation"].values[idx])
    example["document"] = f"<tag> {curr_topic} <tag> {example['document']}"
    
    return example

# get the processor number and set the number of process
ds["train"] = ds["train"].map(add_topic, with_indices=True, num_proc=os.cpu_count())


Map (num_proc=8):   0%|          | 0/14262 [00:00<?, ? examples/s]

Unnamed: 0,document,id,summary
0,<tag> film lagu indonesia album jakarta orang ...,1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,<tag> indonesia jakarta orang anak salah negar...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,<tag> pemain gol menit laga tim musim liga per...,1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,<tag> jakarta indonesia kpk negara presiden pe...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,<tag> jakarta indonesia kpk negara presiden pe...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


In [None]:
print(json.dumps(ds["train"][:5], indent=4))

### Load Model

In [None]:
bart_model = AutoModelForSeq2SeqLM.from_pretrained("indobenchmark/indobart-v2")
indonlg_tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")

indonlg_tokenizer.add_special_tokens({"tagging": "<tag>"})

model = bart_model
tokenizer = indonlg_tokenizer

tokenizer

IndoNLGTokenizer(name_or_path='indobenchmark/indobart-v2', vocab_size=40004, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<tag>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	39942: AddedToken("<tag>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40003: AddedToken("<m

### Train Model

In [None]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["document"], max_length=768, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_ds = ds.map(preprocess_function, batched=True)

# example first 5 data
for i in range(5):
    print("raw: " + tokenizer.decode(tokenized_ds["train"][i]["input_ids"]))
    print("token: ", tokenizer.convert_ids_to_tokens(tokenized_ds["train"][i]["input_ids"]))
    print("tokenized: " + " ".join(map(str, tokenized_ds["train"][i]["input_ids"])))
    print("")


Map:   0%|          | 0/14262 [00:00<?, ? examples/s]

Using auto half precision backend


In [None]:
# Setup evaluation
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        # use_stemmer=True
    )
    return result


# Load pretrained model and evaluate model after each epoch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

output_dir = f"{benc_dir}/models"

per_device_batch_size = 16  # 8 for low hardware spec

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5,  # hf example: 2e-5
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    weight_decay=0.01,
    # save_steps=1000,
    save_total_limit=1,
    num_train_epochs=3,  # hf example: 2
    fp16=True,  # comment this if using mps/apple sillicon chip (not supported)
    predict_with_generate=True,
    generation_max_length=80,
    log_level="info",
    logging_first_step=True,
    resume_from_checkpoint=True,
    load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,  # FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5467,0.532454,0.669272,0.58563,0.630227,0.659181
2,0.4183,0.520436,0.663243,0.578115,0.623583,0.654314
3,0.3293,0.52754,0.652068,0.564888,0.608441,0.642801


Saving model checkpoint to ./results/00-indobart-bertopic/checkpoint-1000
Configuration saved in ./results/00-indobart-bertopic/checkpoint-1000/config.json
Configuration saved in ./results/00-indobart-bertopic/checkpoint-1000/generation_config.json
Model weights saved in ./results/00-indobart-bertopic/checkpoint-1000/model.safetensors
tokenizer config file saved in ./results/00-indobart-bertopic/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/00-indobart-bertopic/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [results/00-indobart-bertopic/checkpoint-1000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8

TrainOutput(global_step=5349, training_loss=0.4522194787260973, metrics={'train_runtime': 1566.2985, 'train_samples_per_second': 27.317, 'train_steps_per_second': 3.415, 'total_flos': 1.58046914912256e+16, 'train_loss': 0.4522194787260973, 'epoch': 3.0})

### Predict Test Data and evaluate the score

In [None]:
# Generate predictions
test_predictions = trainer.predict(tokenized_ds["test"])

# Get the predictions and labels from the result
preds = test_predictions.predictions
labels = test_predictions.label_ids

# Evaluate using the compute_metrics function
rouge_scores = compute_metrics((preds, labels))

# Print the ROUGE scores
print("ROUGE scores on the test set:", json.dumps(rouge_scores, indent=4))

# save the results to a file 'rouge_scores.txt'
with open(os.path.join(benc_dir, "rouge_scores.txt"), "w") as f:
    f.write(str(rouge_scores))

The following columns in the test set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 3762
  Batch size = 8


ROUGE scores on the test set: {'rouge1': np.float64(0.6462466268578306), 'rouge2': np.float64(0.5592346798766463), 'rougeL': np.float64(0.6021837836547783), 'rougeLsum': np.float64(0.6362907678019264)}


### Pipeline summary

In [None]:
# get device
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# # create table to show the result: document, summary, generated_summary
# df = pd.DataFrame(columns=['document', 'summary', 'generated_summary'])
# for i in range(100):
#     document = ds['test'][i]['document']
#     summary = ds['test'][i]['summary']
#     generated_summary = summarizer(document, min_length=5, max_length=80)
#     df = pd.concat([df, pd.DataFrame([[document, summary, generated_summary[0]['summary_text']]], columns=['document', 'summary', 'generated_summary'])], ignore_index=True)


# # Save the DataFrame to a CSV file
# df.to_csv(f'{benc_dir}/summarization_result.csv')
# df.to_json(f'{benc_dir}/summarization_result.json')

# df.head()

# generated_summary = summarizer(ds['test']['document'], min_length=5, max_length=80)



# data consists of: document, summary, generated_summary

# ds['test'] = ds['test'].map(lambda x: {'document': x['document'], 'summary': x['summary'], 'generated_summary': summarizer(x['document'], min_length=5, max_length=80)[0]['summary_text']})

# print(json.dumps(ds['test'], indent=4))



You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,document,summary,generated_summary
0,"Jakarta, CNN Indonesia - - Dilansir AFP, seora...",Eman Ahmed Abd El Aty memiliki berat badan men...,seorang warga mesir yang dipercaya sebagai wa...
1,Menteri Pertahanan Ryamizard Ryacudu menyambut...,Menteri Pertahanan Ryamizard Ryacudu menyambut...,pertahanan ryamizard ryacudu menyambut baik u...
2,"Jakarta, CNN Indonesia - - Meski sudah hampir ...",Rumah produksi film yang dibintangi Lindsay Lo...,"meski sudah hampir 12 tahun berlalu, film mea..."
3,"Usai melaksanakan ibadah haji, Eggi Sudjana ak...",Eggi Sudjana akhirnya mendatangi kantor Baresk...,eggi sudjana akhirnya mendatangi kantor bares...
4,Banyak cara untuk memberikan pengajaran kepada...,Game permainan Kartu Muslim. Menggunakan basis...,kartu berhologram khusus dipasarkan dalam sat...
