In [18]:
# # untar the file source
# import tarfile

# file_path = "dataset/liputan6_data.tar.gz"

# with tarfile.open(file_path, "r:gz") as tar:
#     print("Contents of the tar.gz archive:")
#     for member in tar.getmembers():
#         print(member.name)

# tf = tarfile.open(file_path)
# tf.extractall('dataset/')


### References
# - add data_rate_limit in jupyter notebook: https://stackoverflow.com/questions/43490495/how-to-set-notebookapp-iopub-data-rate-limit-and-others-notebookapp-settings-in
# - how to extract and unzip tar.gz file: https://www.youtube.com/watch?v=GCnSR1X_zwc

# About the Project
### 1. Objective
Mengembangkan sistem berbasis AI untuk meringkas dokumen dengan metode text understanding yang dapat memperoleh gagasan utama dari suatu berita.
Untuk mencapai target tersebut, dapat digunakan model BERT. Setelah pengembangan model, dilakukan evaluasi model dan penarikan kesimpulan. Setelah proyek selesai, tahap yang tak kalah penting adalah publikasi di repo GitHub.
### 2. Text Summarization
Metode komputasi yang digunakan untuk merangkum teks menjadi ringkasan yang lebih singkat namun tetap mempertahankan poin kunci, frasa, dan konteks teks tersebut, sehingga didapatkan teks dalam bentuk yang lebih ringkas dan padat.
### 3. Dataset
Dataset yang digunakan adalah kumpulan data summarization dari website Liputan6.
Setiap data memuat informasi berikut:
1. id
2. url
3. clean_article
4. clean_summary
5. extractive_summary

#### 3.1. about **id_liputan6** Dataset
The dataset contains 215,827 document-summary pairs and is used to develop benchmark models for extractive and abstractive summarization with multilingual and monolingual BERT-based models.

The dataset covers various topics and events from October 2000 to October 2010.
In summary, the accompanying paper makes the following contributions:
1. Releases a large-scale Indonesian summarization corpus with over 200K documents, an order of magnitude larger than previously available datasets.
2. Presents statistics showing that the summaries in the dataset are reasonably abstractive, and provides two test partitions (a standard test set and an extremely abstractive test set).
3. Develops benchmark extractive and abstractive summarization models based on pre-trained BERT models.
4. Conducts an error analysis to guide future research on Indonesian text summarization.

The dataset covers various topics and events from October 2000 to October 2010. The topics include:
1. Politics
2. Business
3. Sport
4. Technology
5. Health
6. Entertainment


In [17]:
# import library

import torch
from datasets import load_dataset, DownloadConfig, load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BertTokenizer, EncoderDecoderModel, T5Tokenizer, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

import pandas as pd
import numpy as np

In [3]:
# Load the "canonical" configuration of the id_liputan6 dataset from the
# local data directory, passing a DownloadConfig with delete_extracted=True.

DATA_DIR = "dataset/liputan6_data/"
download_config = DownloadConfig(delete_extracted=True)
dataset = load_dataset(
    "id_liputan6",
    "canonical",
    data_dir=DATA_DIR,
    download_config=download_config,
)

# xtreme subset
# dataset_xtreme = load_dataset(
#         "id_liputan6",
#         "xtreme",
#         data_dir = DATA_DIR,
#     download_config = download_config
#         )

In [4]:
# Display the loaded DatasetDict (its splits, feature names and row counts).
dataset

DatasetDict({
    validation: Dataset({
        features: ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'],
        num_rows: 10972
    })
    test: Dataset({
        features: ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'],
        num_rows: 10972
    })
    train: Dataset({
        features: ['id', 'url', 'clean_article', 'clean_summary', 'extractive_summary'],
        num_rows: 193883
    })
})

In [5]:
# Pick two rows from each split after a seeded shuffle (seed=42).
# NOTE(review): sample_eval_text is not referenced by the cells visible here.
sample_train_text = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(2))
)
sample_eval_text = (
    dataset["validation"]
    .shuffle(seed=42)
    .select(range(2))
)

## Compare performance between the pre-trained models **cahya/bert2gpt-indonesian-summarization** and **cahya/t5-base-indonesian-summarization-cased**

In [6]:
from tqdm import tqdm

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def generate_summaries(model, tokenizer, article):
    """Generate a summary string for a single article.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``. If it also
            exposes a ``device`` attribute, the encoded input is moved to
            that device before generation.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        article: Text of the article to summarize.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens skipped.
    """
    # Encode the article as a tensor, truncated to at most 512 tokens.
    input_ids = tokenizer.encode(article, truncation=True, max_length=512, return_tensors='pt')

    # The encoded ids must live on the same device as the model; otherwise
    # generation fails when the model has been moved to the GPU.
    model_device = getattr(model, "device", None)
    if model_device is not None:
        input_ids = input_ids.to(model_device)

    # NOTE: do_sample=True means decoding involves sampling, so summaries can
    # differ between runs unless the random seed is fixed by the caller.
    summary_ids = model.generate(
        input_ids,
        min_length=20,
        max_length=80,
        num_beams=10,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        no_repeat_ngram_size=2,
        use_cache=True,
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
    )

    # Only the first returned sequence is decoded.
    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary_text

def eval_perf_models(model, tokenizer, samples=None):
    """Summarize a set of articles with a model and score the result with ROUGE.

    Args:
        model: Model passed through to ``generate_summaries``.
        tokenizer: Tokenizer passed through to ``generate_summaries``.
        samples: Optional mapping-like object with ``"clean_article"`` and
            ``"clean_summary"`` entries of equal length. When omitted, the
            notebook-level ``sample_train_text`` is used, which keeps
            existing two-argument calls working unchanged.

    Returns:
        A tuple ``(df_eval, summaries)`` where ``df_eval`` is a DataFrame
        indexed by ROUGE variant with a single ``"score"`` column holding the
        mid F-measure, and ``summaries`` is the list of generated summaries
        in input order.
    """
    if samples is None:
        samples = sample_train_text

    articles = samples["clean_article"]
    references = samples["clean_summary"]

    # One generated summary per article, with a progress bar.
    summaries = [
        generate_summaries(model, tokenizer, article)
        for article in tqdm(articles, total=len(articles))
    ]

    # calculate ROUGE
    rouge = load_metric("rouge")
    rouge.add_batch(predictions=summaries, references=references)
    score = rouge.compute()

    rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
    df_eval = pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["score"])

    return df_eval, summaries

### BERT2GPT

In [7]:
# Evaluate the BERT2GPT encoder-decoder checkpoint with eval_perf_models.
MODEL_PATH = 'cahya/bert2gpt-indonesian-summarization'

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
# Reuse the tokenizer's CLS/SEP tokens as its BOS/EOS tokens.
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

model = EncoderDecoderModel.from_pretrained(MODEL_PATH)
model = model.to(device)

df_eval_bert2gpt, summaries_bert2gpt = eval_perf_models(tokenizer=tokenizer, model=model)
df_eval_bert2gpt

100%|████████████████████████████████████████████████| 2/2 [01:16<00:00, 38.07s/it]
  rouge = load_metric("rouge")


Unnamed: 0,score
rouge1,0.485266
rouge2,0.320056
rougeL,0.467085
rougeLsum,0.467085


### T5

In [8]:
# Evaluate the T5 checkpoint with eval_perf_models.
MODEL_PATH = 'cahya/t5-base-indonesian-summarization-cased'

tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
# NOTE(review): unlike the BERT2GPT cell, this model is not moved with
# .to(device) — confirm whether that is intentional.
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)

df_eval_t5, summaries_t5 = eval_perf_models(tokenizer=tokenizer, model=model)
df_eval_t5

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
100%|████████████████████████████████████████████████| 2/2 [00:37<00:00, 18.62s/it]


Unnamed: 0,score
rouge1,0.471869
rouge2,0.309764
rougeL,0.454325
rougeLsum,0.454325


In [15]:
# Lift the column-width limit so long articles and summaries are not truncated.
pd.set_option('display.max_colwidth', None)

# One row per sampled article: source text, reference summary, and the
# summary produced by each model.
df_compare_res = pd.DataFrame(
    data={
        "ORIGINAL TEXT": sample_train_text["clean_article"],
        "REFERENCE": sample_train_text["clean_summary"],
        "BERT2GPT": summaries_bert2gpt,
        "T5": summaries_t5,
    }
)

df_compare_res

Unnamed: 0,ORIGINAL TEXT,REFERENCE,BERT2GPT,T5
0,"Liputan6.com, Solok: Hingga Rabu (24/10), lebih dari 10 sumur warga di Jorong Tanjung Harapan Nagari Sungai Nanam, Solok, Sumatra Barat, telah tercemar belerang. Hasil penelitian tim Dinas Pertambangan dan Energi Sumbar, pencemaran diakibatkan keluarnya udara panas dari magma Gunung Talang pascagempa 12 September silam. Warga tak berani memakai air untuk kebutuhan sehari-hari karena khawatir keracunan. Dinas Pertambangan dan Energi Sumbar bersama tim Vulkanologi dan Mitigasi Bencana Geologi Bandung masih meneliti penyebab keluarnya hawa panas tersebut. Udara panas bersuhu 61 derajat Celcius dan berbau belerang itu keluar dari lantai rumah milik tiga warga. Saat ini, ketiga rumah itu sudah dikosongkan dan diberi garis polisi [baca: Gunung Talang Mengeluarkan Hawa Panas Beracun]. (YNI/Denni Risman dan Arset Kusnadi).","Air sumur warga Tanjung Harapan Nagari Sungai Nanam, Kabupaten Solok, Sumbar, mulai tercemar belerang. Diduga, pencemaran diakibatkan keluarnya udara panas dari magma Gunung Talang di kawasan itu.","sekitar 10 sumur warga di jorong tanjung harapan nagari sungai nanam, solok, sumbar, tercemar belerang. pencemaran diakibatkan keluarnya udara panas dari magma gunung talang pascagempa 12 september silam.","Lebih dari 10 sumur warga di Jorong Tanjung Harapan Nagari Sungai Nanam, Solok, Sumbar, telah tercemar belerang. Pencemaran diakibatkan keluarnya udara panas dari magma Gunung Talang pascagempa 12 September silam."
1,"Liputan6.com, Barcelona: Setelah mengalami musim yang buruk pada tahun pertamanya di Spanyol, Thierry Henry membuktikan dirinya masih bertaji. Di bawah pelatih Pep Guardiola sinarnya kembali terang. Titi, panggilan akrabnya, banyak membantu Barcelona menyabet tiga gelar juara bergengsi plus juara Piala Dunia Antarklub di Jepang, Desember silam. Namun begitu, di musim kompetisi sekarang sinar Henry kembali redup. Cedera merongrong membuatnya tidak banyak mendapat kesempatan bermain. Insiden handball ketika membela Prancis yang menyebabkan gol ke gawang Republik Irlandia di babak play-off Piala Dunia 2010 semakin menenggelamkan namanya. Itu semua bermuara pada kabar bahwa dirinya akan hengkang dari Nou Camp, markas Barcelona, setelah perhelatan PD 2010 di Afrika Selatan. Amerika Serikat menjadi persinggahan berikut yang diyakini menjadi pilihan Henry. Kabar itu kian santer kini dan bahkan dikabarkan harian olahraga Katalan, penyerang legendaris Arsenal ini sudah menandatangani prakontrak dengan klub Major League Soccer atau Liga Utama AS, New York Red Bulls. (DIM/Sport).","Thierry Henry dikabarkan sudah menandatangani prakontrak dengan klub di Amerika Serikat, New York Red Bulls.",thierry henry membuktikan dirinya masih bertaji. di bawah pelatih pep guardiola sinarnya kembali terang.,Thierry Henry membuktikan dirinya masih bertaji. Di bawah bimbingan Pep Guardiola sinarnya kembali redup.


# Conclusion
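1. Pada keempat varian ROUGE (ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-Lsum), model BERT2GPT memperoleh skor sedikit lebih tinggi dibandingkan T5.
2. Hasil ringkasan dari BERT2GPT kurang lebih mirip dengan T5.
3. Pada beberapa hasil, T5 menghasilkan ringkasan yang lebih abstraktif.
4. Catatan: evaluasi ini hanya menggunakan 2 sampel dari data train dan decoding berbasis sampling, sehingga kesimpulan di atas masih bersifat indikatif.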