In [1]:
import os, sys
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset

import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from transformers import BertTokenizer, AutoModel

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from repo.indobenchmark.toolkit.tokenization_indonlg import IndoNLGTokenizer



In [2]:
# check if cuda or mps available, if available, use one of them, otherwise use cpu

device = torch.device('cpu')

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('using cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    # os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "1" # This is tracked as pytorch issue #98222
    print('using mps')
else:
    device = torch.device('cpu')
    print('using cpu')


using mps


In [3]:
# Specify the directory and file path
benc_directory = 'benc_result/01-indobart-bertopic/'

# Create the benc_directory if it doesn't exist
os.makedirs(benc_directory, exist_ok=True)

### Load datasets

In [4]:
# ds = load_dataset('maryantocinn/indosum')
ds = load_dataset('./repo/SEACrowd/indosum/indosum.py')

# show first 5 data from the dataset in pandas like table
pd.DataFrame(ds['train'][:5]).head()

Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


In [5]:
# check the length of the dataset
print("Train dataset length: ", len(ds['train']))
print("Validation dataset length: ", len(ds['validation']))
print("Test dataset length: ", len(ds['test']))

Train dataset length:  14262
Validation dataset length:  750
Test dataset length:  3762


In [None]:
nltk.download("all", quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/siagian/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/siagian/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/siagian/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/siagian/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/siagian/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

True

### Topic Modelling

In [None]:
embedding_model = SentenceTransformer("LazarusNLP/all-indobert-base-v4")

stop_words = stopwords.words('indonesian') + stopwords.words('english') + ['cnn', 'dailysocial', 'id']
vectorizer_model = CountVectorizer(stop_words=stop_words, token_pattern="[^\W\d_]+")

topic_model = BERTopic(embedding_model=embedding_model,vectorizer_model=vectorizer_model,nr_topics=10, verbose=True)

In [8]:
topics, probs = topic_model.fit_transform(ds['train']['document'])

2024-11-03 23:57:46,187 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/446 [00:00<?, ?it/s]

2024-11-03 23:58:35,146 - BERTopic - Embedding - Completed ✓
2024-11-03 23:58:35,146 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-11-03 23:58:44,261 - BERTopic - Dimensionality - Completed ✓
2024-11-03 23:58:44,262 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-03 23:58:44,474 - BERTopic - Cluster - Completed ✓
2024-11-03 23:58:44,474 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-03 23:58:45,987 - BERTopic - Representation - Completed ✓
2024-11-03 23:58:45,989 - BERTopic - Topic reduction - Reducing number of topics
2024-11-03 23:58:47,377 - BERTopic - Topic reduction - Reduced number of topics from 165 to 10


In [9]:
topic_info = topic_model.get_topic_info()

# save to excel
topic_info.to_excel(benc_directory + 'topic_info.xlsx')

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4668,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, kpk, anak,...",[Suara.com - Sepuluh orang tewas dalam unjuk r...
1,0,3344,0_pemain_gol_laga_musim,"[pemain, gol, laga, musim, menit, tim, liga, p...",[Barcelona sukses meraih kemenangan ketiganya ...
2,1,2284,1_jakarta_kpk_jalan_indonesia,"[jakarta, kpk, jalan, indonesia, polisi, ketua...",[Rimanews - Polisi dan TNI akan menjaga ketat ...
3,2,1281,2_film_lagu_album_konser,"[film, lagu, album, konser, indonesia, jakarta...","[Jakarta, CNN Indonesia - - Edwin, sutradara a..."
4,3,1110,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, pengguna...",[Hari ini DailySocial.id dan MRA (Mugi Rekso A...
5,4,587,4_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, ameri...","[Jakarta, CNN Indonesia - - Presiden Amerika S..."
6,5,395,5_indonesia_wisata_budaya_makanan,"[indonesia, wisata, budaya, makanan, pariwisat...","[Jakarta, CNN Indonesia - - Seni budaya Indone..."
7,6,370,6_persen_indonesia_rp_pemerintah,"[persen, indonesia, rp, pemerintah, harga, keu...","[Jakarta, CNN Indonesia - - Tahun 2017 akan be..."
8,7,211,7_tubuh_kanker_kulit_penelitian,"[tubuh, kanker, kulit, penelitian, tidur, oran...","[Buat sebagian orang, berolahraga dianggap cuk..."
9,8,12,8_minyak_barel_mentah_opec,"[minyak, barel, mentah, opec, harga, produksi,...","[Jakarta, CNN Indonesia - - Harga minyak menta..."


In [10]:
topic_document_info = topic_model.get_document_info(ds['train']['document'])

# save to excel
topic_document_info.to_excel(benc_directory + 'topic_document_info.xlsx')

topic_document_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, kpk, anak,...",[Suara.com - Sepuluh orang tewas dalam unjuk r...,indonesia - jakarta - orang - negara - kpk - a...,0.000000,False
1,Selfie ialah salah satu tema terpanas di kalan...,3,3_startup_teknologi_berita_inovasi,"[startup, teknologi, berita, inovasi, pengguna...",[Hari ini DailySocial.id dan MRA (Mugi Rekso A...,startup - teknologi - berita - inovasi - pengg...,0.562809,False
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",5,5_indonesia_wisata_budaya_makanan,"[indonesia, wisata, budaya, makanan, pariwisat...","[Jakarta, CNN Indonesia - - Seni budaya Indone...",indonesia - wisata - budaya - makanan - pariwi...,1.000000,False
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1,1_jakarta_kpk_jalan_indonesia,"[jakarta, kpk, jalan, indonesia, polisi, ketua...",[Rimanews - Polisi dan TNI akan menjaga ketat ...,jakarta - kpk - jalan - indonesia - polisi - k...,0.811099,False
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,-1,-1_indonesia_jakarta_orang_negara,"[indonesia, jakarta, orang, negara, kpk, anak,...",[Suara.com - Sepuluh orang tewas dalam unjuk r...,indonesia - jakarta - orang - negara - kpk - a...,0.000000,False
...,...,...,...,...,...,...,...,...
14257,"Jakarta, CNN Indonesia - - Amerika Serikat dil...",4,4_trump_negara_presiden_israel,"[trump, negara, presiden, israel, orang, ameri...","[Jakarta, CNN Indonesia - - Presiden Amerika S...",trump - negara - presiden - israel - orang - a...,0.470385,False
14258,"Bandung, CNN Indonesia - - Borneo FC berhasil ...",0,0_pemain_gol_laga_musim,"[pemain, gol, laga, musim, menit, tim, liga, p...",[Barcelona sukses meraih kemenangan ketiganya ...,pemain - gol - laga - musim - menit - tim - li...,0.513807,False
14259,JAKARTA (Pos Kota) – Komisi Pemberantasan Koru...,1,1_jakarta_kpk_jalan_indonesia,"[jakarta, kpk, jalan, indonesia, polisi, ketua...",[Rimanews - Polisi dan TNI akan menjaga ketat ...,jakarta - kpk - jalan - indonesia - polisi - k...,1.000000,False
14260,Merdeka.com - Sebuah kabar gembira datang bagi...,0,0_pemain_gol_laga_musim,"[pemain, gol, laga, musim, menit, tim, liga, p...",[Barcelona sukses meraih kemenangan ketiganya ...,pemain - gol - laga - musim - menit - tim - li...,1.000000,False


### Load Model

In [11]:
%%script true

# bart_model = MBartForConditionalGeneration.from_pretrained('indobenchmark/indobart-v2')

bart_model = AutoModelForSeq2SeqLM.from_pretrained('indobenchmark/indobart-v2')
indonlg_tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart-v2')

model = bart_model
tokenizer = indonlg_tokenizer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Train Model

In [12]:
%%script true

# Prepare and tokenize dataset
def preprocess_function(examples):
    model_inputs = tokenizer(examples["document"], max_length=768, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Setup evaluation
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

tokenized_ds = ds.map(preprocess_function, batched=True)

# Load pretrained model and evaluate model after each epoch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

per_device_batch_size = 8 # 8 for low hardware spec
output_dir = "./results/00-indobart"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=3.75e-5, # hf example: 2e-5
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    weight_decay=0.01,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=3, # hf example: 2
    fp16=True, # comment this if using mps/apple sillicon chip (not supported)
    predict_with_generate=True,
    generation_max_length=80,
    log_level="info",
    logging_first_step=True,
    resume_from_checkpoint=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer, # FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
%%script true

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Predict Test Data and evaluate the score

In [14]:
%%script true

# Generate predictions
test_predictions = trainer.predict(tokenized_ds['test'])

# Get the predictions and labels from the result
preds = test_predictions.predictions
labels = test_predictions.label_ids

# Evaluate using the compute_metrics function
rouge_scores = compute_metrics((preds, labels))

# Print the ROUGE scores
print("ROUGE scores on the test set:", rouge_scores)

# save the results to a file 'rouge_scores.txt'
with open(os.path.join(benc_directory, 'rouge_scores.txt'), 'w') as f:
    f.write(str(rouge_scores))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Pipeline summary

In [15]:
%%script true

# get device
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# create table to show the result: document, summary, generated_summary
df = pd.DataFrame(columns=['document', 'summary', 'generated_summary'])
for i in range(100):
    document = ds['test'][i]['document']
    summary = ds['test'][i]['summary']
    generated_summary = summarizer(document, min_length=5, max_length=80)
    df = pd.concat([df, pd.DataFrame([[document, summary, generated_summary[0]['summary_text']]], columns=['document', 'summary', 'generated_summary'])], ignore_index=True)


# Save the DataFrame to a CSV file
df.to_csv(f'{benc_directory}/summarization_result.csv')
df.to_json(f'{benc_directory}/summarization_result.json')

df.head()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
