In [None]:
%%capture
!pip install datasets
!pip install transformers
!pip install rouge_score
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

import datasets
import transformers

### **Data Preprocessing**


Pada notebook ini, saya menggunakan BERT, yaitu model berbasis arsitektur Transformer, untuk membuat sebuah AI yang dapat meringkas teks.

Kenapa tidak menggunakan RNN? Karena RNN sulit mengingat kata-kata yang letaknya jauh di belakang pada teks yang panjang (masalah ketergantungan jangka panjang).

Kenapa tidak menggunakan LSTM? Karena LSTM memproses token satu per satu secara berurutan, sehingga sangat lambat untuk dilatih.

Transformer menjawab permasalahan tersebut.

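Normalnya, arsitektur Transformer terdiri dari encoder dan decoder, namun BERT hanya menggunakan bagian encoder. Karena itu, pada notebook ini dua checkpoint BERT digabung menjadi satu model encoder-decoder: satu berperan sebagai encoder dan satu lagi sebagai decoder.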

BERT base merupakan model yang cukup ringan, dengan 110 juta parameter dan 12 lapisan encoder, dibandingkan BERT large yang memiliki 340 juta parameter dan 24 lapisan encoder.

In [None]:
from transformers import BertTokenizerFast

# Load the uncased BERT tokenizer and reuse its CLS/SEP tokens as the BOS/EOS tokens.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

# CNN/DailyMail 3.0.0: the full training split plus the first 10% of validation.
_CNN_DAILYMAIL = ("cnn_dailymail", "3.0.0")
train_data = datasets.load_dataset(*_CNN_DAILYMAIL, split="train")
val_data = datasets.load_dataset(*_CNN_DAILYMAIL, split="validation[:10%]")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.




In [None]:
# Batching and sequence-length settings used by the preprocessing and training cells.
batch_size = 16
encoder_max_length = 512  # articles are padded/truncated to this many tokens
decoder_max_length = 128  # highlights are padded/truncated to this many tokens

def process_data_to_model_inputs(batch):
  """Tokenize a batch of articles and highlights into encoder-decoder features.

  Args:
    batch: mapping holding parallel lists of strings under "article" and
      "highlights".

  Returns:
    The same ``batch`` object with "input_ids", "attention_mask",
    "decoder_input_ids", "decoder_attention_mask" and "labels" added.
    Padding positions in "labels" are replaced by -100.
  """
  # tokenize the inputs and labels
  inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask

  # Teacher forcing: the decoder input at position t must be the target token at
  # position t-1. Since transformers v4.12, EncoderDecoderModel compares the
  # logits with `labels` position by position (no internal shift), so feeding an
  # unshifted copy of the labels would let the decoder simply copy its input.
  # Shift right by one and prepend the start token instead.
  start_id = tokenizer.bos_token_id
  if start_id is None:
    # BOS was not configured on the tokenizer; fall back to the CLS token.
    start_id = tokenizer.cls_token_id
  batch["decoder_input_ids"] = [[start_id] + ids[:-1] for ids in outputs.input_ids]
  # The prepended start token is always attended to; the mask shifts with the ids.
  batch["decoder_attention_mask"] = [[1] + mask[:-1] for mask in outputs.attention_mask]

  # ignore padding tokens in the loss
  pad_id = tokenizer.pad_token_id
  batch["labels"] = [[-100 if token == pad_id else token for token in ids] for ids in outputs.input_ids]

  return batch

# Raw text columns dropped after tokenization, and the feature columns exposed as tensors.
_RAW_COLUMNS = ["article", "highlights", "id"]
_MODEL_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def _encode_split(split):
    """Tokenize one dataset split in batches and expose its features as torch tensors."""
    encoded = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=_RAW_COLUMNS,
    )
    encoded.set_format(type="torch", columns=_MODEL_COLUMNS)
    return encoded


train_data = _encode_split(train_data)
val_data = _encode_split(val_data)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/1337 [00:00<?, ? examples/s]

### **Warm-starting the Encoder-Decoder Model**

In [None]:
from transformers import EncoderDecoderModel

# Warm-start both the encoder and the decoder from the same pretrained BERT checkpoint.
_BERT_CHECKPOINT = "bert-base-uncased"
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(_BERT_CHECKPOINT, _BERT_CHECKPOINT)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
model_config = bert2bert.config

# Special-token ids, taken from the tokenizer.
model_config.decoder_start_token_id = tokenizer.bos_token_id
model_config.eos_token_id = tokenizer.eos_token_id
model_config.pad_token_id = tokenizer.pad_token_id

# Vocabulary size mirrors the decoder's; the remaining entries are the
# beam-search settings stored on the model config.
_generation_settings = {
    "vocab_size": model_config.decoder.vocab_size,
    "max_length": 142,
    "min_length": 56,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    "length_penalty": 2.0,
    "num_beams": 4,
}
for _option, _value in _generation_settings.items():
    setattr(model_config, _option, _value)

### **Fine-Tuning Warm-Started Encoder-Decoder Models**

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Load the ROUGE metric used by compute_metrics during validation.
# NOTE(review): the recorded cell output echoes this line the way a Python warning
# does, which suggests `load_metric` is deprecated in the installed `datasets`
# release — confirm before upgrading the library.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for generated summaries.

    Args:
        pred: prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 at ignored
            positions).

    Returns:
        Dict with "rouge2_precision", "rouge2_recall" and "rouge2_fmeasure",
        each rounded to 4 decimals.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    pad_id = tokenizer.pad_token_id

    # -100 is not a valid vocabulary id, so swap it for the pad token before
    # decoding. np.where builds new arrays, leaving the caller's `pred.label_ids`
    # untouched, and the same guard is applied to the predictions in case they
    # carry -100 padding as well.
    pred_ids = np.asarray(pred.predictions)
    labels_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(labels_ids == -100, pad_id, labels_ids)

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = datasets.load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
# Training hyper-parameters. None of these values are tuned; adjust them freely.
_step_schedule = {
    "logging_steps": 1000,
    "save_steps": 500,
    "eval_steps": 8000,
    "warmup_steps": 2000,
}
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    overwrite_output_dir=True,
    save_total_limit=3,
    evaluation_strategy="steps",
    predict_with_generate=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    **_step_schedule,
)

# Wire the model, tokenizer, datasets and metric callback into the seq2seq trainer.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then save the resulting model (no path is passed to save_model).
trainer.train()
trainer.save_model()


### **Evaluation**

Awesome, we finished training our model. Let's now evaluate it on the test data. We make use of the dataset's handy `.map()` function to generate a summary for each sample of the test data.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the evaluation cell loads the fine-tuned
# checkpoint from a directory under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import datasets
import torch
from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_pretrained("/content/drive/MyDrive/NLP-BERT")

# Use the GPU when one is present, otherwise fall back to CPU instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test")

# only use 16 test examples for the notebook - DELETE LINE FOR FULL EVALUATION
test_data = test_data.select(range(16))

batch_size = 16  # change to 64 for full evaluation

# map data correctly
def generate_summary(batch):
    """Generate a summary for every article in ``batch``.

    Args:
        batch: mapping with a list of article strings under "article".

    Returns:
        The same ``batch`` with the decoded summaries stored under "pred".
    """
    # The tokenizer adds its own start/end special tokens around the text;
    # articles are cut off at the BERT maximum length of 512 tokens.
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # Send the inputs to whichever device the model lives on rather than a
    # hard-coded "cuda", so inputs and weights can never end up on different devices.
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # all special tokens are removed from the decoded text
    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return batch

results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])

pred_str = results["pred"]
label_str = results["highlights"]

# Load the metric in this cell too, so evaluation also works in a session where
# the training section (which defines `rouge`) was never executed.
rouge = datasets.load_metric("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

In [None]:
def generate_summary2(text):
  """Summarize a single piece of text and return the summary string.

  Args:
    text: the article to summarize; it is truncated to 512 tokens.

  Returns:
    The decoded summary with special tokens removed.
  """
  inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

  # Follow the model's own device instead of a hard-coded "cuda".
  device = model.device
  input_ids = inputs.input_ids.to(device)
  attention_mask = inputs.attention_mask.to(device)

  output = model.generate(input_ids, attention_mask=attention_mask)
  output_str = tokenizer.batch_decode(output, skip_special_tokens=True)

  return output_str[0]

# Sample passage about the Eiffel Tower, summarized as a quick smoke test.
eiffel_text = "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."
generate_summary2(eiffel_text)

'the eiffel tower surpassed the washington monument to become the tallest structure in the world. it was the first structure to reach a height of 300 metres in paris in 1930. it is now taller than the chrysler building by 5. 2 metres ( 17 ft ) and is the second tallest free - standing structure in paris.'

In [None]:
# News article used as a longer summarization example. The text spans several
# physical lines, so it is held in a triple-quoted string (a plain double-quoted
# literal cannot contain line breaks); the wording itself is unchanged.
png_article = """The United States and Papua New Guinea have signed a new bilateral defense cooperation agreement – a move that has sparked controversy in the Pacific Island nation and comes as Washington and China jostle for influence in the region. US Secretary of State Antony Blinken and Papua New Guinea (PNG) Prime Minister James Marape signed the pact and a maritime security agreement on Monday during Blinken’s visit to the capital Port Moresby. Advance text of the agreements were not released by either side, but the new defense cooperation was expected to expand US access to military and other facilities in PNG, bolstering Washington’s security ties in the South Pacific. That region – a constellation of sparsely populated island and archipelago nations and territories as well as New Zealand and Australia – has outsized strategic significance. The Pacific Islands, for example, were the site of decisive battles during World War Two. The region has taken on renewed importance for Washington as it seeks to bolster its relationships and presence in Asia amid rising tensions with an assertive China that’s rapidly expanded its naval capabilities in recent years. Those concerns were heightened last year after Beijing signed a security pact with the Solomon Islands – and tried, but failed, to win support for a sweeping, regional trade and security communique with Pacific Island nations. Blinken’s visit to PNG comes after US President Joe Biden last week cut short an Asia trip that would have included stops in Port Moresby and Sydney, Australia, due to ongoing debt ceiling negotiations at home. In a statement Saturday, PNG framed its agreement with the US as an opportunity to advance its infrastructure and capacity for national defense at a time of growing global security concerns. “Papua New Guinea does not have enemies but it pays to be prepared. Territorial dispute is (imminent), as in the case of Ukraine-Russia,” the statement said. 
“This agreement is not about geopolitics but rather recognizes the country’s need to build its defense capabilities because border disputes are inevitable in the future,” it said, adding it did not preclude the government from “working with” other countries, including China. China has become a significant player in the country’s economy, both as an investor and consumer of its rich natural resources. The US and PNG militaries already have a cooperative security assistance relationship focused primarily on joint humanitarian exercises and the training of PNG military personnel, according to the State Department. The new agreement has sparked debate in Papua New Guinea – including over a lack of transparency from the government on what it entailed, while purported leaked drafts circulated online. The pact would need Parliament’s approval and could face judicial challenges, experts say. But its signing sends a significant message to the region. “PNG signing a defense agreement signals to the rest of the Pacific that its largest nation has chosen the West – Australia and the US – as its security partner,” said Maholopa Laveil, FDC Pacific Fellow at the Lowy Institute, seconded from the University of Papua New Guinea. The defense pact and Fiji’s reported ending of its police training agreement with China earlier this year “are major wins, getting the largest Pacific nations on side for the US in its attempts to limit China’s influence in the region,” Laveil said, adding that Marape may “leverage the threat of China” to request more development assistance from the US. Meanwhile, Australia is preparing to sign its own security treaty with PNG. he US deal with PNG – especially on the heels of the Solomon Islands’ security pact with China last year – may also raise concerns about lines of alignment being drawn in a region that has long prioritized projecting strength through unity. 
“(Signing such pacts) can also create divisions,” said Patrick Kaiku, an academic focused on international relations at the University of Papua New Guinea, noting a perspective among Pacific Island states that they should not take sides in geopolitical rivalries. “If states are not adhering to it … that can also be a problem for regional solidarity,” he said. Blinken is expected to meet with leaders of the Pacific Island Forum regional body in Port Moresby on Monday, the forum has said, taking Biden’s place at the gathering. The cancellation of Biden’s trip – which would have been the first from a sitting US President to Papua New Guinea – has been characterized by some observers as a potential ding to Washington’s recent bid to up its engagement with the region. That bid has included opening embassies in the Solomon Islands and Tonga this year, while Biden hosted Pacific Island leaders in Washington for a summit in September and released the first-ever national strategy on engaging the Pacific Islands. “US President Joe Biden’s now-scrapped visit to PNG was meant to be a culmination of these efforts and send a powerful signal to Pacific Islanders about the US commitment to the region,” said Parker Novak, a non-resident follow at the Atlantic Council think tank in Washington’s Global China Hub. “Instead, it underlines skepticism about the United States’ ability to follow through on the promises it has made,” Novak said, adding that with Blinken’s visit and other expected diplomacy, it may not do “long-term damage to US efforts in the Pacific.”"""
generate_summary2(png_article)

"us secretary of state antony blinken and papua new guinea prime minister james marape signed the pact and a maritime security agreement on monday. advance text of the agreements was not released by either side, but the new defense cooperation was expected to expand us access to military and other facilities in png, bolstering washington's security ties in the south pacific. png is a constellation of sparsely populated island and archipelago nations and territories as well as new zealand and australia."