In [1]:
import os, sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset

import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer



In [2]:
# check if cuda or mps available, if available, use one of them, otherwise use cpu

device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("using cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    # os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1" # This is tracked as pytorch issue #98222
    print("using mps")
else:
    device = torch.device("cpu")
    print("using cpu")


using mps


In [3]:
# Specify the directory and file path
benc_directory = "benc_result/01-indobart-bertopic/"

# Create the benc_directory if it doesn't exist
os.makedirs(benc_directory, exist_ok=True)

### Load datasets

In [4]:
# ds = load_dataset('maryantocinn/indosum')
ds = load_dataset("./repo/SEACrowd/indosum/indosum.py")

# pandas dataframe
df_ds = {}
for key in ds.keys():
    df_ds[key] = ds[key].to_pandas()

# show first 5 data from the dataset in pandas like table
df_ds["train"].head()



Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


In [5]:
# check the length of the dataset
print("Train dataset length: ", len(ds["train"]))
print("Validation dataset length: ", len(ds["validation"]))
print("Test dataset length: ", len(ds["test"]))

Train dataset length:  14262
Validation dataset length:  750
Test dataset length:  3762


In [6]:
nltk.download("all", quiet=True)

True

### Topic Modelling

In [7]:
embedding_model = SentenceTransformer("LazarusNLP/all-indobert-base-v4")

stop_words = (
    stopwords.words("indonesian")
    + stopwords.words("english")
    + ["cnn", "dailysocial", "id"]
)
vectorizer_model = CountVectorizer(stop_words=stop_words, token_pattern="[^\W\d_]+")

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    nr_topics=10,
    verbose=True,
)

In [8]:
topics, probs = topic_model.fit_transform(ds["train"]["document"])

2024-11-04 12:53:47,346 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/446 [00:00<?, ?it/s]

2024-11-04 12:54:35,424 - BERTopic - Embedding - Completed ✓
2024-11-04 12:54:35,424 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-11-04 12:54:44,744 - BERTopic - Dimensionality - Completed ✓
2024-11-04 12:54:44,745 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-04 12:54:44,964 - BERTopic - Cluster - Completed ✓
2024-11-04 12:54:44,964 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-04 12:54:46,488 - BERTopic - Representation - Completed ✓
2024-11-04 12:54:46,491 - BERTopic - Topic reduction - Reducing number of topics
2024-11-04 12:54:47,993 - BERTopic - Topic reduction - Reduced number of topics from 160 to 10


In [None]:
topic_info = topic_model.get_topic_info()

# save to excel
topic_info.to_csv(benc_directory + "topic_info.csv")

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4714,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...",[Merdeka.com - Wakil Ketua Pansus Angket KPK M...
1,0,3161,0_pemain_gol_musim_laga,"[pemain, gol, musim, laga, menit, tim, liga, p...",[Dua gol dari Isco berkontribusi membawa Spany...
2,1,2124,1_jakarta_kpk_ketua_tersangka,"[jakarta, kpk, ketua, tersangka, indonesia, ja...","[Jakarta, CNN Indonesia - - Komisi Pemberantas..."
3,2,1268,2_film_lagu_album_konser,"[film, lagu, album, konser, indonesia, jakarta...","[Jakarta, CNN Indonesia - - Film horor Indones..."
4,3,1135,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, pengguna...",[Salah satu fitur unik yang dimiliki semua per...
5,4,833,4_makanan_tubuh_indonesia_orang,"[makanan, tubuh, indonesia, orang, memiliki, m...","[Jakarta, CNN Indonesia - - Saat musim hujan s..."
6,5,552,5_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, seran...","[Suara.com -. Pemerintah Israel berharap, nega..."
7,6,448,6_persen_indonesia_rp_pemerintah,"[persen, indonesia, rp, pemerintah, harga, gam...","[Jakarta, CNN Indonesia - - Tahun 2017 akan be..."
8,7,16,7_pasangan_hubungan_kado_orang,"[pasangan, hubungan, kado, orang, anak, berten...",[Banyak orang yang berpendapat langgenggnya se...
9,8,11,8_minyak_barel_mentah_opec,"[minyak, barel, mentah, opec, harga, produksi,...","[Jakarta, CNN Indonesia - - Harga minyak menta..."


In [None]:
topic_document_info = topic_model.get_document_info(ds["train"]["document"], df_ds["train"])

# save to excel
topic_document_info.to_csv(benc_directory + "topic_document_info.csv")

topic_document_info

Unnamed: 0,document,id,summary,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...",[Merdeka.com - Wakil Ketua Pansus Angket KPK M...,indonesia - jakarta - orang - negara - anak - ...,0.000000,False
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...,Selfie ialah salah satu tema terpanas di kalan...,3,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, pengguna...",[Salah satu fitur unik yang dimiliki semua per...,startup - teknologi - berita - inovasi - pengg...,0.541828,False
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",4,4_makanan_tubuh_indonesia_orang,"[makanan, tubuh, indonesia, orang, memiliki, m...","[Jakarta, CNN Indonesia - - Saat musim hujan s...",makanan - tubuh - indonesia - orang - memiliki...,0.766773,False
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...,Merdeka.com - Indonesia Corruption Watch (ICW)...,-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...",[Merdeka.com - Wakil Ketua Pansus Angket KPK M...,indonesia - jakarta - orang - negara - anak - ...,0.000000,False
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1,1_jakarta_kpk_ketua_tersangka,"[jakarta, kpk, ketua, tersangka, indonesia, ja...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - kpk - ketua - tersangka - indonesia ...,0.958790,False
...,...,...,...,...,...,...,...,...,...,...,...
14257,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",1497645345-as-kirimkan-peluncur-rudal-ke-suria...,Amerika Serikat dilaporkan telah mengirimkan s...,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",5,5_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, seran...","[Suara.com -. Pemerintah Israel berharap, nega...",trump - negara - presiden - israel - orang - s...,0.724635,False
14258,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",1495406700-borneo-bersyukur-tahan-persib-di-gbla,Borneo FC menahan imbang Persib Bandung pada l...,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...",[Merdeka.com - Wakil Ketua Pansus Angket KPK M...,indonesia - jakarta - orang - negara - anak - ...,0.000000,False
14259,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1513941815-mantan-dirjen-perhubungan-laut-sege...,Komisi Pemberantasan Korupsi (KPK) sudah melim...,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1,1_jakarta_kpk_ketua_tersangka,"[jakarta, kpk, ketua, tersangka, indonesia, ja...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",jakarta - kpk - ketua - tersangka - indonesia ...,1.000000,False
14260,Merdeka.com - Sebuah kabar gembira datang bagi...,1496440800-rangking-fifa-indonesia-naik-dua-pe...,Kabar gembira datang bagi sepakbola Indone...,Merdeka.com - Sebuah kabar gembira datang bagi...,-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...",[Merdeka.com - Wakil Ketua Pansus Angket KPK M...,indonesia - jakarta - orang - negara - anak - ...,0.000000,False


In [None]:
def add_topic(example, idx):
    # if already have <tag>, return the example
    if "<tag>" in example["document"]:
        return example

    curr_topic = " ".join(topic_document_info["Representation"].values[idx])
    example["document"] = f"<tag> {curr_topic} <tag> {example['document']}"
    
    return example

# get the processor number and set the number of process
ds["train"] = ds["train"].map(add_topic, with_indices=True, num_proc=os.cpu_count())

df_ds["train"] = ds["train"].to_pandas()

df_ds["train"].head()

Map (num_proc=16):   0%|          | 0/14262 [00:00<?, ? examples/s]

'<tag> indonesia jakarta orang negara anak salah kota kpk presiden rp <tag> Jakarta, CNN Indonesia - - Dokter Ryan Thamrin, yang terkenal lewat acara Dokter Oz Indonesia, meninggal dunia pada Jumat (4 / 8) dini hari. Dokter Lula Kamal yang merupakan selebriti sekaligus rekan kerja Ryan menyebut kawannya itu sudah sakit sejak setahun yang lalu. Lula menuturkan, sakit itu membuat Ryan mesti vakum dari semua kegiatannya, termasuk menjadi pembawa acara Dokter Oz Indonesia. Kondisi itu membuat Ryan harus kembali ke kampung halamannya di Pekanbaru, Riau untuk menjalani istirahat. " Setahu saya dia orangnya sehat, tapi tahun lalu saya dengar dia sakit. (Karena) sakitnya, ia langsung pulang ke Pekanbaru, jadi kami yang mau jenguk juga susah. Barangkali mau istirahat, ya betul juga, kalau di Jakarta susah isirahatnya, " kata Lula kepada CNNIndonesia.com, Jumat (4 / 8). Lula yang mengenal Ryan sejak sebelum aktif berkarier di televisi mengaku belum sempat membesuk Ryan lantaran lokasi yang jauh.

### Load Model

In [12]:
bart_model = AutoModelForSeq2SeqLM.from_pretrained("indobenchmark/indobart-v2")
indonlg_tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")

indonlg_tokenizer.add_special_tokens({"additional_special_tokens": ["<tag>"]})

model = bart_model
tokenizer = indonlg_tokenizer

tokenizer

IndoNLGTokenizer(name_or_path='indobenchmark/indobart-v2', vocab_size=40004, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<tag>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	39942: AddedToken("<tag>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40003: AddedToken("<m

### Train Model

In [13]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["document"], max_length=768, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Setup evaluation
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result


tokenized_ds = ds.map(preprocess_function, batched=True)

# Load pretrained model and evaluate model after each epoch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

per_device_batch_size = 8  # 8 for low hardware spec
output_dir = "./results/00-indobart"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5,  # hf example: 2e-5
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=3,  # hf example: 2
    fp16=True,  # comment this if using mps/apple sillicon chip (not supported)
    predict_with_generate=True,
    generation_max_length=80,
    log_level="info",
    logging_first_step=True,
    resume_from_checkpoint=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,  # FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/14262 [00:00<?, ? examples/s]

model.safetensors:  68%|######7   | 357M/526M [00:00<?, ?B/s]

Map:   0%|          | 0/3762 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
trainer.train()

### Predict Test Data and evaluate the score

In [None]:
# Generate predictions
test_predictions = trainer.predict(tokenized_ds["test"])

# Get the predictions and labels from the result
preds = test_predictions.predictions
labels = test_predictions.label_ids

# Evaluate using the compute_metrics function
rouge_scores = compute_metrics((preds, labels))

# Print the ROUGE scores
print("ROUGE scores on the test set:", rouge_scores)

# save the results to a file 'rouge_scores.txt'
with open(os.path.join(benc_directory, "rouge_scores.txt"), "w") as f:
    f.write(str(rouge_scores))

### Pipeline summary

In [None]:
# get device
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# create table to show the result: document, summary, generated_summary
df = pd.DataFrame(columns=['document', 'summary', 'generated_summary'])
for i in range(100):
    document = ds['test'][i]['document']
    summary = ds['test'][i]['summary']
    generated_summary = summarizer(document, min_length=5, max_length=80)
    df = pd.concat([df, pd.DataFrame([[document, summary, generated_summary[0]['summary_text']]], columns=['document', 'summary', 'generated_summary'])], ignore_index=True)


# Save the DataFrame to a CSV file
df.to_csv(f'{benc_directory}/summarization_result.csv')
df.to_json(f'{benc_directory}/summarization_result.json')

df.head()