In [1]:
import os, sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset

import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer

In [2]:
# check if cuda or mps available, if available, use one of them, otherwise use cpu

device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("using cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    # os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1" # This is tracked as pytorch issue #98222
    print("using mps")
else:
    device = torch.device("cpu")
    print("using cpu")


using cuda


In [3]:
# Specify the directory and file path
benc_directory = "benc_result/01-indobart-bertopic/"

# Create the benc_directory if it doesn't exist
os.makedirs(benc_directory, exist_ok=True)

### Load datasets

In [4]:
# ds = load_dataset('maryantocinn/indosum')
ds = load_dataset("./repo/SEACrowd/indosum/indosum.py")

# pandas dataframe
df_ds = {}
for key in ds.keys():
    df_ds[key] = ds[key].to_pandas()

# show first 5 data from the dataset in pandas like table
df_ds["train"].head()



Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


In [5]:
# check the length of the dataset
print("Train dataset length: ", len(ds["train"]))
print("Validation dataset length: ", len(ds["validation"]))
print("Test dataset length: ", len(ds["test"]))

Train dataset length:  14262
Validation dataset length:  750
Test dataset length:  3762


In [6]:
nltk.download("all", quiet=True)

True

### Topic Modelling

In [7]:
embedding_model = SentenceTransformer("LazarusNLP/all-indobert-base-v4")

stop_words = (
    stopwords.words("indonesian")
    + stopwords.words("english")
    + ["cnn", "dailysocial", "id"]
)
vectorizer_model = CountVectorizer(stop_words=stop_words, token_pattern="[^\W\d_]+")

topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    nr_topics=10,
    verbose=True,
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.37k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/709k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [8]:
topics, probs = topic_model.fit_transform(ds["train"]["document"])

2024-11-04 06:13:54,636 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/446 [00:00<?, ?it/s]

2024-11-04 06:14:43,531 - BERTopic - Embedding - Completed ✓
2024-11-04 06:14:43,533 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-04 06:15:28,358 - BERTopic - Dimensionality - Completed ✓
2024-11-04 06:15:28,361 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-04 06:15:29,215 - BERTopic - Cluster - Completed ✓
2024-11-04 06:15:29,216 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-04 06:15:36,248 - BERTopic - Representation - Completed ✓
2024-11-04 06:15:36,255 - BERTopic - Topic reduction - Reducing number of topics
2024-11-04 06:15:41,550 - BERTopic - Topic reduction - Reduced number of topics from 163 to 10


In [9]:
topic_info = topic_model.get_topic_info()

# save to excel
topic_info.to_csv(benc_directory + "topic_info.csv")

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4938,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...","[Jakarta, CNN Indonesia - - Komisi Pemberantas..."
1,0,3207,0_pemain_gol_musim_laga,"[pemain, gol, musim, laga, menit, tim, liga, p...",[Dua gol dari Isco berkontribusi membawa Spany...
2,1,2295,1_jakarta_kpk_indonesia_jalan,"[jakarta, kpk, indonesia, jalan, partai, ketua...","[Jakarta (ANTARA News) - Bimanesh Sutarjo, dok..."
3,2,1175,2_film_lagu_album_jakarta,"[film, lagu, album, jakarta, indonesia, orang,...","[Jakarta, CNN Indonesia - - Film horor Indones..."
4,3,1038,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, fitur, i...","[Sebelum tips ini, kita sudah pernah memberika..."
5,4,620,4_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, ameri...",[Presiden Amerika Serikat Donald Trump secara ...
6,5,411,5_indonesia_makanan_pariwisata_kopi,"[indonesia, makanan, pariwisata, kopi, wisata,...","[Jakarta, CNN Indonesia - - Festival Seni Buda..."
7,6,353,6_tubuh_kanker_makanan_penyakit,"[tubuh, kanker, makanan, penyakit, kulit, pene...",[Merdeka.com - Musim hujan telah datang dan in...
8,7,172,7_persen_rp_harga_pajak,"[persen, rp, harga, pajak, indonesia, minyak, ...","[Jakarta, CNN Indonesia - - Menteri Koordinato..."
9,8,53,8_pendidikan_sekolah_guru_ui,"[pendidikan, sekolah, guru, ui, muhadjir, perg...","[Jakarta, CNN Indonesia - - Menteri Pendidikan..."


In [10]:
topic_document_info = topic_model.get_document_info(ds["train"]["document"], df_ds["train"])

# save to excel
topic_document_info.to_csv(benc_directory + "topic_document_info.csv")

topic_document_info

Unnamed: 0,document,id,summary,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",indonesia - jakarta - orang - negara - anak - ...,0.000000,False
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...,Selfie ialah salah satu tema terpanas di kalan...,3,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, fitur, i...","[Sebelum tips ini, kita sudah pernah memberika...",startup - teknologi - berita - inovasi - fitur...,0.502241,False
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",5,5_indonesia_makanan_pariwisata_kopi,"[indonesia, makanan, pariwisata, kopi, wisata,...","[Jakarta, CNN Indonesia - - Festival Seni Buda...",indonesia - makanan - pariwisata - kopi - wisa...,0.931050,False
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...,Merdeka.com - Indonesia Corruption Watch (ICW)...,-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",indonesia - jakarta - orang - negara - anak - ...,0.000000,False
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1,1_jakarta_kpk_indonesia_jalan,"[jakarta, kpk, indonesia, jalan, partai, ketua...","[Jakarta (ANTARA News) - Bimanesh Sutarjo, dok...",jakarta - kpk - indonesia - jalan - partai - k...,0.995418,False
...,...,...,...,...,...,...,...,...,...,...,...
14257,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",1497645345-as-kirimkan-peluncur-rudal-ke-suria...,Amerika Serikat dilaporkan telah mengirimkan s...,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",4,4_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, ameri...",[Presiden Amerika Serikat Donald Trump secara ...,trump - negara - presiden - israel - orang - a...,0.403283,False
14258,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",1495406700-borneo-bersyukur-tahan-persib-di-gbla,Borneo FC menahan imbang Persib Bandung pada l...,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",indonesia - jakarta - orang - negara - anak - ...,0.000000,False
14259,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1513941815-mantan-dirjen-perhubungan-laut-sege...,Komisi Pemberantasan Korupsi (KPK) sudah melim...,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1,1_jakarta_kpk_indonesia_jalan,"[jakarta, kpk, indonesia, jalan, partai, ketua...","[Jakarta (ANTARA News) - Bimanesh Sutarjo, dok...",jakarta - kpk - indonesia - jalan - partai - k...,0.781011,False
14260,Merdeka.com - Sebuah kabar gembira datang bagi...,1496440800-rangking-fifa-indonesia-naik-dua-pe...,Kabar gembira datang bagi sepakbola Indone...,Merdeka.com - Sebuah kabar gembira datang bagi...,-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, anak, sala...","[Jakarta, CNN Indonesia - - Komisi Pemberantas...",indonesia - jakarta - orang - negara - anak - ...,0.000000,False


In [11]:
def add_topic(example, idx):
    # if already have <tag>, return the example
    if "<tag>" in example["document"]:
        return example

    curr_topic = " ".join(topic_document_info["Representation"].values[idx])
    example["document"] = f"<tag> {curr_topic} <tag> {example['document']}"
    
    return example

# get the processor number and set the number of process
ds["train"] = ds["train"].map(add_topic, with_indices=True, num_proc=os.cpu_count())

df_ds["train"] = ds["train"].to_pandas()

df_ds["train"].head()

Map (num_proc=8):   0%|          | 0/14262 [00:00<?, ? examples/s]

Unnamed: 0,document,id,summary
0,<tag> indonesia jakarta orang negara anak sala...,1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,<tag> startup teknologi berita inovasi fitur i...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,<tag> indonesia makanan pariwisata kopi wisata...,1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,<tag> indonesia jakarta orang negara anak sala...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,<tag> jakarta kpk indonesia jalan partai ketua...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


### Load Model

In [12]:
bart_model = AutoModelForSeq2SeqLM.from_pretrained("indobenchmark/indobart-v2")
indonlg_tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")

indonlg_tokenizer.add_special_tokens({"additional_special_tokens": ["<tag>"]})

model = bart_model
tokenizer = indonlg_tokenizer

tokenizer

IndoNLGTokenizer(name_or_path='indobenchmark/indobart-v2', vocab_size=40004, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['<tag>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	39942: AddedToken("<tag>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	40003: AddedToken("<m

### Train Model

In [13]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["document"], max_length=768, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Setup evaluation
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result


tokenized_ds = ds.map(preprocess_function, batched=True)

# Load pretrained model and evaluate model after each epoch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

per_device_batch_size = 8  # 8 for low hardware spec
output_dir = "./results/00-indobart"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5,  # hf example: 2e-5
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=3,  # hf example: 2
    fp16=True,  # comment this if using mps/apple sillicon chip (not supported)
    predict_with_generate=True,
    generation_max_length=80,
    log_level="info",
    logging_first_step=True,
    resume_from_checkpoint=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,  # FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Map:   0%|          | 0/14262 [00:00<?, ? examples/s]

Map:   0%|          | 0/3762 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Using auto half precision backend


In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,262
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,349
  Number of trainable parameters = 131,543,040


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5451,0.533041,0.666536,0.583236,0.626986,0.656955
2,0.4167,0.51326,0.669243,0.58644,0.629475,0.660222
3,0.3286,0.522803,0.660575,0.576017,0.619246,0.651756


Saving model checkpoint to ./results/00-indobart/checkpoint-1000
Configuration saved in ./results/00-indobart/checkpoint-1000/config.json
Configuration saved in ./results/00-indobart/checkpoint-1000/generation_config.json
Model weights saved in ./results/00-indobart/checkpoint-1000/model.safetensors
tokenizer config file saved in ./results/00-indobart/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/00-indobart/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [results/00-indobart/checkpoint-1000] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 750
  Batch size = 8
Saving model checkpoint to ./results/00-indobart/checkpoint-20

TrainOutput(global_step=5349, training_loss=0.45139215052250414, metrics={'train_runtime': 1585.1852, 'train_samples_per_second': 26.991, 'train_steps_per_second': 3.374, 'total_flos': 1.5804686727512064e+16, 'train_loss': 0.45139215052250414, 'epoch': 3.0})

### Predict Test Data and evaluate the score

In [15]:
# Generate predictions
test_predictions = trainer.predict(tokenized_ds["test"])

# Get the predictions and labels from the result
preds = test_predictions.predictions
labels = test_predictions.label_ids

# Evaluate using the compute_metrics function
rouge_scores = compute_metrics((preds, labels))

# Print the ROUGE scores
print("ROUGE scores on the test set:", rouge_scores)

# save the results to a file 'rouge_scores.txt'
with open(os.path.join(benc_directory, "rouge_scores.txt"), "w") as f:
    f.write(str(rouge_scores))

The following columns in the test set don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 3762
  Batch size = 8


ROUGE scores on the test set: {'rouge1': np.float64(0.6492739538349439), 'rouge2': np.float64(0.5628012293066681), 'rougeL': np.float64(0.6066622052994133), 'rougeLsum': np.float64(0.6391191735873272)}


### Pipeline summary

In [16]:
# get device
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# create table to show the result: document, summary, generated_summary
df = pd.DataFrame(columns=['document', 'summary', 'generated_summary'])
for i in range(100):
    document = ds['test'][i]['document']
    summary = ds['test'][i]['summary']
    generated_summary = summarizer(document, min_length=5, max_length=80)
    df = pd.concat([df, pd.DataFrame([[document, summary, generated_summary[0]['summary_text']]], columns=['document', 'summary', 'generated_summary'])], ignore_index=True)


# Save the DataFrame to a CSV file
df.to_csv(f'{benc_directory}/summarization_result.csv')
df.to_json(f'{benc_directory}/summarization_result.json')

df.head()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,document,summary,generated_summary
0,"Jakarta, CNN Indonesia - - Dilansir AFP, seora...",Eman Ahmed Abd El Aty memiliki berat badan men...,seorang warga mesir yang dipercaya sebagai wa...
1,Menteri Pertahanan Ryamizard Ryacudu menyambut...,Menteri Pertahanan Ryamizard Ryacudu menyambut...,pesawat tanpa awak itu rencananya digunakan u...
2,"Jakarta, CNN Indonesia - - Meski sudah hampir ...",Rumah produksi film yang dibintangi Lindsay Lo...,"meski sudah hampir 12 tahun berlalu, film mea..."
3,"Usai melaksanakan ibadah haji, Eggi Sudjana ak...",Eggi Sudjana akhirnya mendatangi kantor Baresk...,eggi sudjana akhirnya mendatangi kantor bares...
4,Banyak cara untuk memberikan pengajaran kepada...,Game permainan Kartu Muslim. Menggunakan basis...,permainan kartu tersebut memberikan pengajara...
