In [None]:
%load_ext autoreload
%autoreload 2

import os
import json

from src.datasets import IndoSum
from src.common import get_device
from src.indobart.base import get_model, get_tokenizer, get_config

import stanza
import torch
import spacy
from spacy.tokens import Doc
from spacy import displacy

import numpy as np
import nltk
import evaluate
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

from transformers import BartModel, BartConfig
from transformers.models.bart.modeling_bart import BartAttention
import torch.nn as nn


from accelerate import Accelerator

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Create an Accelerator and take its device; the bare `device` expression
# displays the selected device as the cell output.
accelerator = Accelerator()
device = accelerator.device
device

device(type='cuda')

In [12]:
# Download and set up Stanza's Indonesian ("id") NLP model.
# `nlp` is read as a module-level global by build_dependency_matrices; the
# pipeline is built with the tokenize, mwt, pos, lemma and depparse processors.
stanza.download("id")
nlp = stanza.Pipeline("id", processors="tokenize,mwt,pos,lemma,depparse")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-14 16:35:55 INFO: Downloaded file to /home/paperspace/stanza_resources/resources.json
2024-11-14 16:35:55 INFO: Downloading default packages for language: id (Indonesian) ...


2024-11-14 16:35:56 INFO: File exists: /home/paperspace/stanza_resources/id/default.zip
2024-11-14 16:36:05 INFO: Finished downloading models and saved to /home/paperspace/stanza_resources
2024-11-14 16:36:05 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-14 16:36:05 INFO: Downloaded file to /home/paperspace/stanza_resources/resources.json
2024-11-14 16:36:06 INFO: Loading these models for language: id (Indonesian):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| pos       | gsd_charlm   |
| lemma     | gsd_nocharlm |
| depparse  | gsd_charlm   |

2024-11-14 16:36:06 INFO: Using device: cuda
2024-11-14 16:36:06 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-14 16:36:06 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-14 16:36:06 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-14 16:36:07 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-14 16:36:07 INFO: 

### Data Loading

In [13]:
# Instantiate the project's IndoSum wrapper and display its `ds` attribute
# (the train / test / validation splits).
indosum = IndoSum()
indosum.ds

DatasetDict({
    train: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 14262
    })
    test: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 3762
    })
    validation: Dataset({
        features: ['document', 'id', 'summary'],
        num_rows: 750
    })
})

In [14]:
# Preview the first rows of the train split via the wrapper's `to_pd` helper.
indosum.to_pd("train").head()

Unnamed: 0,document,id,summary
0,"Jakarta, CNN Indonesia - - Dokter Ryan Thamrin...",1501893029-lula-kamal-dokter-ryan-thamrin-saki...,Dokter Lula Kamal yang merupakan selebriti sek...
1,Selfie ialah salah satu tema terpanas di kalan...,1509072914-dua-smartphone-zenfone-baru-tawarka...,Asus memperkenalkan ZenFone generasi keempat...
2,"Jakarta, CNN Indonesia - - Dinas Pariwisata Pr...",1510613677-songsong-visit-2020-bengkulu-perkua...,Dinas Pariwisata Provinsi Bengkulu kembali men...
3,Merdeka.com - Indonesia Corruption Watch (ICW)...,1502706803-icw-ada-kejanggalan-atas-tewasnya-s...,Indonesia Corruption Watch (ICW) meminta Komis...
4,Merdeka.com - Presiden Joko Widodo (Jokowi) me...,1503039338-pembagian-sepeda-usai-upacara-penur...,Jokowi memimpin upacara penurunan bendera. Usa...


### Dependency Parsing

In [7]:
# Convert Stanza Document to spaCy Doc for Visualization
def stanza_to_spacy(doc):
    """
    Convert a Stanza-parsed document into a single spaCy Doc for visualization.

    All sentences are flattened into one token sequence. Stanza's ``head``
    values are 1-based positions *within their own sentence*, so each head is
    shifted by the number of words in the preceding sentences before it is
    used to index the flattened spaCy document. Without that shift, every
    sentence after the first would be attached to tokens of the first one.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``words`` whose items carry ``text``, ``deprel`` and ``head``
            (``head == 0`` marks the sentence root).

    Returns:
        A spaCy ``Doc`` built on a blank "id" vocabulary in which every token
        has ``dep_`` set to the Stanza relation and ``head`` set to its
        governor; root tokens are their own head.
    """
    words, deps, heads = [], [], []

    for sentence in doc.sentences:
        # Index of this sentence's first word in the flattened token list.
        offset = len(words)
        for position, word in enumerate(sentence.words):
            words.append(word.text)
            deps.append(word.deprel)
            if word.head == 0:
                # Root: spaCy represents a root as a token that heads itself.
                heads.append(offset + position)
            else:
                # Convert the sentence-local 1-based head to a document-level
                # 0-based index.
                heads.append(offset + word.head - 1)

    # Create a spaCy Doc object using the extracted information
    spacy_doc = Doc(spacy.blank("id").vocab, words=words)
    for token, head, dep in zip(spacy_doc, heads, deps):
        token.dep_ = dep
        token.head = spacy_doc[head]

    return spacy_doc

# Build Dependency Information Matrices (DIM) for each sentence in a document
def build_dependency_matrices(document):
    """
    Parse `document` with the module-level Stanza pipeline (`nlp`) and derive
    one Dependency Information Matrix (DIM) per sentence.

    For a sentence with n words the DIM is an n x n float32 tensor in which
    entry (i, j) is 1 when word i+1 is the head of word j+1 or vice versa
    (the matrix is symmetric), and 0 otherwise. Root words (head == 0)
    contribute no entry.

    Returns:
        A tuple ``(pairs, doc)`` where ``pairs`` is a list of
        ``(dim_tensor, sentence_text)`` tuples in sentence order and ``doc``
        is the object returned by ``nlp(document)``.
    """
    parsed = nlp(document)

    def sentence_matrix(sentence):
        # Undirected adjacency matrix of the sentence's dependency tree.
        size = len(sentence.words)
        adjacency = np.zeros((size, size))
        for word in sentence.words:
            if word.head <= 0:
                continue  # head == 0 marks the root: nothing to connect
            # word.id and word.head are 1-based within the sentence
            child, parent = word.id - 1, word.head - 1
            adjacency[child, parent] = 1
            adjacency[parent, child] = 1
        return torch.tensor(adjacency, dtype=torch.float32)

    pairs = [(sentence_matrix(sentence), sentence.text) for sentence in parsed.sentences]
    return pairs, parsed

# Parse and Visualize Dependencies
def visualize_dependencies(doc):
    """
    Render the dependency arcs of a Stanza-parsed document in the notebook.

    The document is first converted with `stanza_to_spacy`, then drawn with
    displaCy's "dep" style. Nothing is returned.
    """
    # jupyter=True makes displaCy render inline in the notebook
    displacy.render(stanza_to_spacy(doc), style="dep", jupyter=True)

#### Data Exploration

In [15]:
# Use the first validation document as the running example for the
# dependency-parsing helpers; the bare expression displays its text.
sample_doc = indosum.ds["validation"][0]['document']
sample_doc

'Ketua MPR Zulkifli Hasan menyesalkan kisruh yang terjadi antara pelaku sarana transportasi online dan tradisional. Zulkifli menyarankan adanya pertemuan bersama antara pemerintah, pelaku transportasi online dan transportasi tradisional demi meredam kisruh yang masih belum terselesaikan. Zulkifli menilai aturan yang dikeluarkan pemerintah seharusnya tidak hanya membahas tarif tapi juga mekanisme yang dapat menguntungkan semua pihak, baik pelaku transportasi online maupun tradisional. " Tidak hanya tarif tapi apa saja harus diatur. Dipanggil keduanya untuk berbicara masing-masing, musyawarah, duduk bareng kemudian dibuat aturan yang saling menguntungkan. Kan bisa saling melengkapi, negara lain bisa masa kita enggak bisa, " ucap Zulkifli di Gedung DPR, Senayan, Jakarta Pusat, Senin (27 / 3). Baca juga: Setya Novanto: Jangan Sampai Kisruh Taksi dan Ojek Online Jadi Besar Ketua Umum PAN menambahkan bahwa hal ini harus diatur karena menyangkut mata pencaharian dari masyarakat itu sendiri. M

In [17]:
# Parse the sample document and print each sentence next to its DIM.
# Sentences are numbered from 1 in the printed output (enumerate start=1).
sample_dim_sentence_pairs, sample_stanza_doc = build_dependency_matrices(sample_doc)
print("Dependency Information Matrices for each sentence:")
for i, (matrix, sentence) in enumerate(sample_dim_sentence_pairs, 1):
    print(f"Raw Sentence {i}:\n{sentence}")
    print(f"DIM Sentence {i}:\n{matrix}\n")

Dependency Information Matrices for each sentence:
Raw Sentence 1:
Ketua MPR Zulkifli Hasan menyesalkan kisruh yang terjadi antara pelaku sarana transportasi online dan tradisional.
DIM Sentence 1:
tensor([[0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0

### Linguistic-Guided Attention in the Encoder

In [None]:
# Define a custom attention layer for the encoder to use the DIM during the attention calculation
class EncoderLinguisticGuidedAttention(BartAttention):
    """
    Encoder self-attention whose weights are modulated by a Dependency
    Information Matrix (DIM).

    The attention weights A produced by the parent class are rescaled
    element-wise as ``(alpha * DIM + I) * A`` and the layer output is then
    recomputed from the projected values with those rescaled weights.

    Args:
        embed_dim: Model hidden size, forwarded to ``BartAttention``.
        num_heads: Number of attention heads, forwarded to ``BartAttention``.
        dropout: Attention dropout probability, forwarded to ``BartAttention``.
        alpha: Scale applied to the DIM before the identity is added.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.0, alpha=1.0):
        super().__init__(embed_dim, num_heads, dropout, is_decoder=False)
        self.alpha = alpha

    def forward(self, hidden_states, dim_matrix=None, **kwargs):
        """
        Run self-attention, optionally guided by ``dim_matrix``.

        Args:
            hidden_states: Encoder hidden states, indexed below as
                (batch, seq_len, embed_dim).
            dim_matrix: Optional DIM broadcastable to the attention weights.
                A (seq_len, seq_len) or (batch, seq_len, seq_len) tensor is
                accepted; the latter is broadcast over heads. When ``None``
                the layer behaves exactly like the parent attention, which is
                what happens when the stock encoder layer calls it, since that
                call does not pass a DIM.
                NOTE(review): the DIM must be aligned with the subword
                sequence seen by the encoder; the caller is responsible for
                that alignment and for plumbing the tensor down to this layer.
            **kwargs: Forwarded unchanged to ``BartAttention.forward``.

        Returns:
            A tuple with the same length as the parent's return value, in
            which element 0 is the recomputed output and element 1 holds the
            DIM-modulated attention weights.

        Raises:
            RuntimeError: If the parent attention does not return weights.
        """
        if dim_matrix is None:
            return super().forward(hidden_states, **kwargs)

        # The weights are needed for the modulation, so always request them.
        # This means the parent computes a full attention pass whose output is
        # then discarded; only its weights (and any trailing values) are kept.
        parent_kwargs = dict(kwargs, output_attentions=True)
        outputs = super().forward(hidden_states, **parent_kwargs)
        attn_weights = outputs[1]
        if attn_weights is None:
            raise RuntimeError(
                "BartAttention did not return attention weights; "
                "linguistic-guided attention needs them."
            )

        # Apply linguistic-guided attention
        # assumes attn_weights is (batch, heads, tgt_len, src_len) — TODO confirm
        # against the installed transformers version.
        dim_matrix = dim_matrix.to(device=attn_weights.device, dtype=attn_weights.dtype)
        if dim_matrix.dim() == 3:
            dim_matrix = dim_matrix.unsqueeze(1)  # broadcast over the head axis
        identity = torch.eye(
            dim_matrix.size(-1), device=attn_weights.device, dtype=attn_weights.dtype
        )
        lg_attn_weights = (self.alpha * dim_matrix + identity) * attn_weights

        # Recompute the context from the *projected values*, per head, and run
        # it through the output projection. Multiplying the weights with the
        # raw hidden states would skip v_proj/out_proj and does not match the
        # per-head shape of the weights.
        bsz, seq_len, embed_dim = hidden_states.size()
        value_states = (
            self.v_proj(hidden_states)
            .view(bsz, seq_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
        )
        attn_probs = torch.nn.functional.dropout(
            lg_attn_weights, p=self.dropout, training=self.training
        )
        context = torch.matmul(attn_probs, value_states)
        context = context.transpose(1, 2).reshape(bsz, seq_len, embed_dim)
        attn_output = self.out_proj(context)

        # Keep whatever trailing values the parent returns so the tuple length
        # matches what the calling encoder layer unpacks.
        return (attn_output, lg_attn_weights) + tuple(outputs[2:])

In [None]:
# Now, replace the encoder's attention mechanism with EncoderLinguisticGuidedAttention.
class CustomIndoBARTWithLGA(BartModel):
    """
    BART model whose encoder self-attention layers are replaced with
    ``EncoderLinguisticGuidedAttention``.

    Args:
        config: BART configuration; ``d_model``, ``encoder_attention_heads``
            and ``attention_dropout`` size the replacement attention layers.
        alpha: DIM scale forwarded to every replacement layer.

    NOTE(review): the replacement layers are freshly constructed, so their
    weights must be loaded afterwards (the notebook does this with
    ``load_state_dict``). Also, ``BartModel`` has no language-modelling head;
    since the notebook later trains with ``Seq2SeqTrainer`` and labels, a
    ``BartForConditionalGeneration`` base may be what is needed — confirm.
    """

    def __init__(self, config: BartConfig, alpha=1.0):
        super().__init__(config)

        # BartModel exposes its encoder directly as `self.encoder`; only the
        # *ForConditionalGeneration wrappers nest it under `self.model`.
        # Resolve whichever layout is present instead of assuming `self.model`,
        # which raises AttributeError on a plain BartModel.
        backbone = getattr(self, "model", self)

        # Modify encoder layers to use linguistic-guided attention
        for layer in backbone.encoder.layers:
            layer.self_attn = EncoderLinguisticGuidedAttention(
                config.d_model, config.encoder_attention_heads, config.attention_dropout, alpha=alpha
            )

### Load Model

In [None]:
# Build the tokenizer and config from the project helpers, create the LGA
# model, then copy the pretrained weights from get_model() into it.
# strict=False: keys that do not match between the two state dicts are skipped
# instead of raising; the value returned by load_state_dict (shown as the cell
# output) reports them, so check it to make sure the weights really loaded.
tokenizer = get_tokenizer()
config = get_config()
model = CustomIndoBARTWithLGA(config, alpha=1.0)
model.load_state_dict(get_model().state_dict(), strict=False)

In [None]:
# Display the model's repr to inspect the replaced encoder attention layers.
model

In [None]:
# Display the tokenizer's repr.
tokenizer

### Train Model

In [None]:
# Setup evaluation: fetch NLTK's "punkt_tab" sentence-tokenizer data (for the
# nltk.sent_tokenize calls in compute_metrics) and load the ROUGE metric that
# compute_metrics reads as the module-level `metric`.
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

#### Preparation

In [None]:
# Update data collator to include DIM
class CustomDataCollator(DataCollatorForSeq2Seq):
    """
    Seq2seq collator that also batches per-sentence DIMs.

    Each feature may carry a ``"dim_matrices"`` entry: a sequence of square
    matrices (tensors or nested lists), one per sentence. They are zero-padded
    into a single float32 tensor of shape
    ``(batch, max_sentences, max_tokens, max_tokens)`` stored under
    ``batch["dim_matrices"]``.

    NOTE(review): the padded tensor is indexed by sentence and by word, while
    the encoder attention works on one subword sequence per document — confirm
    how the model is expected to consume this layout.
    """

    def __call__(self, features, return_tensors=None):
        """
        Collate ``features`` with the parent collator and attach padded DIMs.

        Args:
            features: List of per-example dicts. ``"dim_matrices"`` is removed
                from a copy of each dict before the parent collator runs,
                because the parent cannot pad that ragged field.
            return_tensors: Forwarded to ``DataCollatorForSeq2Seq.__call__``.

        Returns:
            The parent's batch with an extra ``"dim_matrices"`` tensor.
        """
        stripped_features = []
        dims_per_example = []
        for feature in features:
            feature = dict(feature)  # shallow copy: leave the caller's dict intact
            raw_dims = feature.pop("dim_matrices", None)
            if raw_dims is None:
                raw_dims = []
            # After Dataset.map the matrices come back as nested lists, so
            # normalise everything to float32 tensors here.
            dims_per_example.append(
                [torch.as_tensor(matrix, dtype=torch.float32) for matrix in raw_dims]
            )
            stripped_features.append(feature)

        batch = super().__call__(stripped_features, return_tensors=return_tensors)

        # Flatten and pad DIMs across documents in the batch for consistent dimensions
        # (default=0 keeps empty batches / documents without sentences from raising)
        max_sentences = max((len(dims) for dims in dims_per_example), default=0)
        max_tokens = max(
            (matrix.size(0) for dims in dims_per_example for matrix in dims),
            default=0,
        )

        # Initialize padded tensor for batched DIMs
        dim_matrices_padded = torch.zeros(
            (len(dims_per_example), max_sentences, max_tokens, max_tokens)
        )

        for i, dims in enumerate(dims_per_example):
            for j, matrix in enumerate(dims):
                dim_matrices_padded[i, j, :matrix.size(0), :matrix.size(1)] = matrix

        batch["dim_matrices"] = dim_matrices_padded
        return batch

# Collator instance handed to Seq2SeqTrainer inside train_model.
data_collator = CustomDataCollator(tokenizer=tokenizer, model=model)

In [None]:
# Prepare and tokenize dataset
def preprocess_function(examples):
    """
    Tokenize documents and summaries and attach each document's DIMs.

    Works both for a single example (``examples["document"]`` is a string)
    and for a batch as passed by ``Dataset.map(..., batched=True)``
    (``examples["document"]`` is a list of strings). The dependency parser is
    always called on one document at a time, so every example gets its own
    list of matrices.

    Args:
        examples: Mapping with ``"document"`` and ``"summary"`` entries.

    Returns:
        The tokenizer output for the documents (truncated to 768 tokens) with
        two extra keys: ``"labels"`` (summary token ids, truncated to 128) and
        ``"dim_matrices"``. For a batch, ``"dim_matrices"`` is a list with one
        entry per document; each entry is a list of per-sentence matrices as
        nested Python lists, which is a form the dataset can serialize.
    """
    documents = examples["document"]

    model_inputs = tokenizer(documents, max_length=768, truncation=True)
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    def document_dims(text):
        # Keep only the matrices; a comprehension (rather than zip(*pairs))
        # also copes with a document that yields no sentences.
        dim_sentence_pairs, _ = build_dependency_matrices(text)
        return [matrix.tolist() for matrix, _ in dim_sentence_pairs]

    # Process DIMs for each document
    if isinstance(documents, str):
        model_inputs["dim_matrices"] = document_dims(documents)
    else:
        model_inputs["dim_matrices"] = [document_dims(text) for text in documents]

    return model_inputs

def compute_metrics(eval_preds):
    """
    Decode predictions and references and score them with the ROUGE metric.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        The dictionary returned by ``metric.compute``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # decode preds and labels
    # -100 is used as a padding/ignore id in both arrays and is not a valid
    # token id, so map it to the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    # NOTE(review): use_stemmer=True presumably applies an English stemmer;
    # confirm that is wanted for Indonesian text.
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return result

# Tokenize every split and attach DIMs. With batched=True, preprocess_function
# receives a dict of lists (one entry per example in the batch).
# NOTE(review): this runs the Stanza pipeline over every document of every
# split, which is likely slow — consider caching the mapped dataset.
tokenized_ds = indosum.ds.map(preprocess_function, batched=True)

def train_model(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length):
    """
    Fine-tune the module-level `model` on the train split and return the trainer.

    Checkpoints go to ``<output_dir>/checkpoint`` and logs to
    ``<output_dir>/logs``. If the checkpoint directory already contains a
    ``checkpoint-<step>`` folder, training resumes from the latest one;
    otherwise it starts from the model's current weights.

    Args:
        output_dir: Base directory for checkpoints and logs.
        per_device_batch_size: Batch size used for both training and evaluation.
        learning_rate: Learning rate passed to the training arguments.
        num_train_epochs: Number of training epochs.
        generation_max_length: Maximum generated length during evaluation.

    Returns:
        The ``Seq2SeqTrainer`` after ``train()`` has completed.

    NOTE(review): this uses the shared module-level `model`, so calling it
    repeatedly keeps training the same weights — confirm that is intended.
    """
    checkpoint_dir = output_dir + "/checkpoint"

    # The `resume_from_checkpoint` training argument is only informational;
    # resuming has to be requested on `trainer.train()`. Requesting it when no
    # checkpoint exists raises, so look for one first.
    has_checkpoint = os.path.isdir(checkpoint_dir) and any(
        name.startswith("checkpoint-")
        and name[len("checkpoint-"):].isdigit()
        and os.path.isdir(os.path.join(checkpoint_dir, name))
        for name in os.listdir(checkpoint_dir)
    )
    resume = True if has_checkpoint else None

    training_args = Seq2SeqTrainingArguments(
        output_dir=checkpoint_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        weight_decay=0.01,
        num_train_epochs=num_train_epochs,
        fp16=True,
        predict_with_generate=True,
        generation_max_length=generation_max_length,
        log_level="info",
        logging_first_step=True,
        logging_dir=output_dir + "/logs",
        resume_from_checkpoint=resume,
        save_total_limit=1,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train(resume_from_checkpoint=resume)

    return trainer
    
def evaluate_model(trainer):
    """
    Evaluate `trainer` on the "test" split of the module-level `tokenized_ds`.

    Returns whatever ``trainer.evaluate`` returns for that split.
    """
    test_split = tokenized_ds["test"]
    return trainer.evaluate(eval_dataset=test_split)


def train_and_evaluate(output_dir, per_device_batch_size, learning_rate, num_train_epochs, generation_max_length):
    """
    Train with `train_model`, then score the resulting trainer with
    `evaluate_model`.

    Returns:
        ``(trainer, eval_results)``.
    """
    trainer = train_model(
        output_dir=output_dir,
        per_device_batch_size=per_device_batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        generation_max_length=generation_max_length,
    )
    return trainer, evaluate_model(trainer)


#### Training & Evaluation

Try several generation max lengths while keeping the remaining parameters fixed.
Observe the best score and the generation max length that produces it.

In [None]:
# One experiment per generation max length (60, 70, 80, 90, 100); every other
# hyper-parameter is held fixed.
# NOTE(review): train_model trains the shared module-level `model`, so each
# experiment appears to start from the weights left by the previous one —
# confirm this is intended. Only generation_max_length varies between runs.
experiments = [
    {
        "output_dir": f"./results/00-indobart-dp/0{i}",
        "per_device_batch_size": 8,
        "learning_rate": 3.75e-5,
        "num_train_epochs": 3,
        "generation_max_length": 50 + i * 10,
    }
    for i in range(1, 6)
]

for exp in experiments:
    os.makedirs(exp["output_dir"], exist_ok=True)

    # The dict keys match train_and_evaluate's parameter names one-to-one.
    trainer, eval_results = train_and_evaluate(**exp)

    # print params and the results
    print("=== Results for experiment ===")
    print("-- Params --")
    print(json.dumps(exp, indent=4))
    print("-- Eval results --")
    print(json.dumps(eval_results, indent=4))

    # save mapping between params and results
    for suffix, payload in (("/params.json", exp), ("/eval_results.json", eval_results)):
        with open(exp["output_dir"] + suffix, "w") as f:
            json.dump(payload, f)

