# Fine-tuning with IndoBART

In [1]:
# Fetch the IndoNLG repository; later cells cd into it and install its requirements.txt.
! git clone https://github.com/indobenchmark/indonlg.git

Cloning into 'indonlg'...
remote: Enumerating objects: 152, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (113/113), done.[K
remote: Total 152 (delta 75), reused 97 (delta 34), pack-reused 0[K
Receiving objects: 100% (152/152), 2.32 MiB | 10.24 MiB/s, done.
Resolving deltas: 100% (75/75), done.


In [2]:
%cd /content/indonlg
! pip install -r requirements.txt

/content/indonlg
Collecting nltk==3.4.5
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 4.5 MB/s 
[?25hCollecting tqdm==4.48.2
  Downloading tqdm-4.48.2-py2.py3-none-any.whl (68 kB)
[K     |████████████████████████████████| 68 kB 5.6 MB/s 
[?25hCollecting torch==1.7.1
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K     |████████████████████████████████| 776.8 MB 18 kB/s 
[?25hCollecting datasets==1.4.1
  Downloading datasets-1.4.1-py3-none-any.whl (186 kB)
[K     |████████████████████████████████| 186 kB 45.0 MB/s 
[?25hCollecting tokenizers==0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.2 MB/s 
[?25hCollecting tensorflow==2.4.0
  Downloading tensorflow-2.4.0-cp37-cp37m-manylinux2010_x86_64.whl (394.7 MB)
[K     |████████████████████████████████| 394.7 MB 20 kB/s 

In [1]:
# Start from the examples directory; the import cell below steps back up one level with os.chdir('../').
%cd /content/indonlg/examples

/content/indonlg/examples


In [2]:
import os, sys
sys.path.append('../')
os.chdir('../')

import torch
import shutil
import random
import numpy as np
import pandas as pd
from torch import optim
from transformers import MBartForConditionalGeneration

from indobenchmark import IndoNLGTokenizer
from utils.train_eval import train, evaluate
from utils.metrics import generation_metrics_fn
from utils.forward_fn import forward_generation
from utils.data_utils import MachineTranslationDataset, GenerationDataLoader

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2488.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1554.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2170.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2082.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2264.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3182.0, style=ProgressStyle(description…




In [3]:
# Mount Google Drive at /content/drive; later cells read checkpoints and datasets from paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
###
# common functions
###
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed (int): value handed to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters held by ``module``.

    Args:
        module: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: when truthy, count only parameters whose
            ``requires_grad`` flag is set; otherwise count all of them.

    Returns:
        int: sum of ``numel()`` over the selected parameters.
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total
    
# Set random seed so that shuffling and training are repeatable across runs
set_seed(26092020)

# Load Model

In [None]:
# Load the pretrained 'indobenchmark/indobart-v2' weights and the tokenizer published under the same name.
bart_model = MBartForConditionalGeneration.from_pretrained('indobenchmark/indobart-v2')
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart-v2')

# `model` is a second name for the same object as `bart_model`, not a copy.
model = bart_model
# Trailing expression: the notebook displays the module's repr (the layer listing).
model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1712.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526426289.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=931715.0, style=ProgressStyle(descripti…

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.





MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(40004, 768, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(40004, 768, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [None]:
# Total parameter count; `trainable` keeps its default of False, so no requires_grad filter is applied.
count_param(model)

131543040

In [None]:
# Load best model
# map_location='cpu' deserializes the tensors onto the CPU, so loading does not depend
# on the device the checkpoint was saved from; load_state_dict then copies the values
# into the model's existing parameters.
model.load_state_dict(torch.load("/content/drive/MyDrive/FinalProject/models/best_model_indobart_id-min.th", map_location='cpu'))

<All keys matched successfully>

# Prepare Dataset

In [None]:
# configs and args

# Training / decoding settings; these are handed to train() and evaluate() in later cells.
lr = 1e-4
gamma = 0.9
lower = True
step_size = 1
beam_size = 5
max_norm = 10
early_stop = 5

max_seq_len = 512
grad_accumulate = 1
no_special_token = False
swap_source_target = True
model_type = 'indo-bart'
valid_criterion = 'SacreBLEU'

# Special token ids passed to the dataset constructor.
separator_id = 4
speaker_1_id = 5
speaker_2_id = 6

train_batch_size = 8
valid_batch_size = 8
test_batch_size = 8

source_lang = "[indonesian]"
target_lang = "[indonesian]"

# Source / target language-id tokens, looked up in the tokenizer's special-token table.
src_lid = tokenizer.special_tokens_to_ids[source_lang]
tgt_lid = tokenizer.special_tokens_to_ids[target_lang]

# Decoding starts from the target language-id token.
model.config.decoder_start_token_id = tgt_lid

# Make sure cuda is deterministic
torch.backends.cudnn.deterministic = True

# create directory (exist_ok=True already tolerates an existing directory)
model_dir = './save/minang/example_id_min'
os.makedirs(model_dir, exist_ok=True)

device = 'cuda0'
# set a specific cuda device
if "cuda" in device:
    # Accept 'cuda0' as well as 'cuda:0'; a bare 'cuda' falls back to device 0.
    gpu_index = device[4:].lstrip(':')
    torch.cuda.set_device(int(gpu_index) if gpu_index else 0)
    device = "cuda"
    model = model.cuda()

# Build the optimizer only after the model sits on its final device, as the
# torch.optim documentation recommends.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Folder on the mounted Drive that holds the preprocessed train / valid / test JSON files.
data_dir = '/content/drive/MyDrive/11-737/Assignment 2/FinalProject/data/min-id'
train_dataset_path = data_dir + '/train_preprocess.json'
valid_dataset_path = data_dir + '/valid_preprocess.json'
test_dataset_path = data_dir + '/test_preprocess.json'

# Options shared by all three splits, stated once so the splits cannot drift apart.
dataset_kwargs = dict(lowercase=lower, no_special_token=no_special_token,
                      speaker_1_id=speaker_1_id, speaker_2_id=speaker_2_id, separator_id=separator_id,
                      max_token_length=max_seq_len, swap_source_target=swap_source_target)

train_dataset = MachineTranslationDataset(train_dataset_path, tokenizer, **dataset_kwargs)
valid_dataset = MachineTranslationDataset(valid_dataset_path, tokenizer, **dataset_kwargs)
test_dataset = MachineTranslationDataset(test_dataset_path, tokenizer, **dataset_kwargs)

# Loader options shared by all three splits; only the dataset, batch size and shuffling differ.
loader_kwargs = dict(model_type=model_type, tokenizer=tokenizer, max_seq_len=max_seq_len,
                     src_lid_token_id=src_lid, tgt_lid_token_id=tgt_lid, num_workers=8)

# Only the training split is shuffled.
train_loader = GenerationDataLoader(dataset=train_dataset, batch_size=train_batch_size, shuffle=True, **loader_kwargs)
valid_loader = GenerationDataLoader(dataset=valid_dataset, batch_size=valid_batch_size, shuffle=False, **loader_kwargs)
test_loader = GenerationDataLoader(dataset=test_dataset, batch_size=test_batch_size, shuffle=False, **loader_kwargs)

# Test model to generate sequences

In [None]:
# Masked-token sanity check on an Indonesian sentence.
inputs = ['aku pergi ke toko obat membeli <mask>']
bart_input = tokenizer.prepare_input_for_generation(inputs, return_tensors='pt',
                                         lang_token = '[indonesian]', decoder_lang_token='[indonesian]')

# Relies on .to() moving the tensors in place — presumably a BatchEncoding; TODO confirm.
bart_input.to(device)
# Inference only: no_grad avoids recording an autograd graph for this forward pass.
with torch.no_grad():
    bart_out = model(**bart_input)
print(tokenizer.decode(bart_input['input_ids'][0]))
# Greedy pick: highest-scoring vocabulary id at every position.
print(tokenizer.decode(bart_out.logits.argmax(dim=-1).squeeze()))

<s> aku pergi ke toko obat membeli <mask> [indonesian]
<s> aku pergi ke toko obat membeli obat.


In [None]:
# Masked-token sanity check on a Javanese sentence.
inputs = ['aku menyang pasar karo <mask>']
bart_input = tokenizer.prepare_input_for_generation(inputs, return_tensors='pt',
                                         lang_token = '[javanese]', decoder_lang_token='[javanese]')

# Relies on .to() moving the tensors in place — presumably a BatchEncoding; TODO confirm.
bart_input.to(device)
# Inference only: no_grad avoids recording an autograd graph for this forward pass.
# `model` is the same object as `bart_model`; using one name throughout avoids confusion.
with torch.no_grad():
    bart_out = model(**bart_input)
print(tokenizer.decode(bart_input['input_ids'][0]))
# Greedy pick: highest-scoring vocabulary id at every position.
print(tokenizer.decode(bart_out.logits.argmax(dim=-1).squeeze()))

<s> aku menyang pasar karo <mask> [javanese]
<s> aku menyang pasar karo tuku </s>


In [None]:
# Masked-token sanity check on a Sundanese sentence.
inputs = ['kuring ka pasar senen meuli daging <mask>']
bart_input = tokenizer.prepare_input_for_generation(inputs, return_tensors='pt',
                                         lang_token = '[sundanese]', decoder_lang_token='[sundanese]')

# Relies on .to() moving the tensors in place — presumably a BatchEncoding; TODO confirm.
bart_input.to(device)
# Inference only: no_grad avoids recording an autograd graph for this forward pass.
# `model` is the same object as `bart_model`; using one name throughout avoids confusion.
with torch.no_grad():
    bart_out = model(**bart_input)
print(tokenizer.decode(bart_input['input_ids'][0]))
# Greedy pick: highest-scoring vocabulary id at every position.
print(tokenizer.decode(bart_out.logits.argmax(dim=-1).squeeze()))

<s> kuring ka pasar senen meuli daging <mask> [sundanese]
<s> kuring ka pasar senen meuli daging sapi.


# Test model to translate

In [None]:
# Download NLTK's 'punkt' data — presumably needed by the metric code used in evaluation; TODO confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Score the loaded checkpoint on the test split before any further fine-tuning.
# Uses the configured `device` (as the train() call does) instead of a hard-coded string.
test_loss, test_metrics, test_hyp, test_label = evaluate(model, data_loader=test_loader, forward_fn=forward_generation, 
                                                         metrics_fn=generation_metrics_fn, model_type=model_type, 
                                                         tokenizer=tokenizer, beam_size=beam_size, 
                                                         max_seq_len=max_seq_len, is_test=True, 
                                                         device=device)

TESTING... : 100%|██████████| 400/400 [16:12<00:00,  2.43s/it]


In [None]:
# Collect this run's metrics and predictions; the one-element lists keep the
# same names and shapes a multi-run version of this cell would use.
metrics_scores = [test_metrics]
result_dfs = [pd.DataFrame({'hyp': test_hyp, 'label': test_label})]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)

# Show the first few hypothesis/label pairs, followed by a blank line.
print('== Prediction Result ==', result_df.head(), '', sep='\n')

# describe() on a single row yields count/mean plus NaN for std.
print('== Model Performance ==', metric_df.describe(), sep='\n')

# Persist the predictions and the summary table next to the model checkpoints.
result_df.to_csv(f"{model_dir}/prediction_result.csv")
metric_df.describe().to_csv(f"{model_dir}/evaluation_result.csv")

== Prediction Result ==
                                                 hyp  \
0  & qu ot ; bo eka  <0xE2> <0x80> <0x93> polemik...   
1                                     o ch re ic eps   
2  manusia dengan mudah menerima prinsip du alism...   
3  kode yang melakukan tugas  <0x40> - <0x40> tug...   
4  pernikahan tersebut tidak berakhir dengan baik...   

                                               label  
0  kah ad iran & qu ot ; bo eka mata & qu ot ; ma...  
1  sop hr onica o ch re ic eps ad olah kumbang ta...  
2  umum n yo manusia jo mudah man ari mo prinsip ...  
3  bagian kode nan mang ara jo an tugas  <0x40> -...  
4  namun panik ahan tu ind ak bar akh ia elok , s...  

== Model Performance ==
            BLEU  SacreBLEU    ROUGE1     ROUGE2     ROUGEL  ROUGELsum
count   1.000000   1.000000   1.00000   1.000000   1.000000   1.000000
mean   25.232849  25.267729  43.64145  27.991344  43.334062  43.358965
std          NaN        NaN       NaN        NaN        NaN        NaN


In [None]:
# Display the raw one-row metrics table (without the describe() summary).
metric_df

Unnamed: 0,BLEU,SacreBLEU,ROUGE1,ROUGE2,ROUGEL,ROUGELsum
0,25.232849,25.267729,43.64145,27.991344,43.334062,43.358965


# Fine-tuning & Evaluation

In [None]:
# Train

n_epochs = 10

# One keyword per line so each setting from the config cell is easy to find.
train(
    model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    forward_fn=forward_generation,
    metrics_fn=generation_metrics_fn,
    valid_criterion=valid_criterion,
    tokenizer=tokenizer,
    n_epochs=n_epochs,
    evaluate_every=1,
    early_stop=early_stop,
    grad_accum=grad_accumulate,
    step_size=step_size,
    gamma=gamma,
    max_norm=max_norm,
    model_type=model_type,
    beam_size=beam_size,
    max_seq_len=max_seq_len,
    model_dir=model_dir,
    exp_id=0,
    fp16="",
    device=device,
)

(Epoch 1) TRAIN LOSS:1.3364 LR:0.00010000: 100%|██████████| 1447/1447 [09:48<00:00,  2.46it/s]


(Epoch 1) TRAIN LOSS:1.3364 BLEU:64.85 SacreBLEU:65.58 ROUGE1:78.21 ROUGE2:64.14 ROUGEL:77.44 ROUGELsum:77.44 LR:0.00010000


VALID LOSS:0.8606: 100%|██████████| 200/200 [00:26<00:00,  7.55it/s]


(Epoch 1) VALID LOSS:0.8606 BLEU:69.07 SacreBLEU:69.09 ROUGE1:83.50 ROUGE2:71.22 ROUGEL:83.03 ROUGELsum:83.02


(Epoch 2) TRAIN LOSS:0.6684 LR:0.00009000: 100%|██████████| 1447/1447 [09:49<00:00,  2.46it/s]


(Epoch 2) TRAIN LOSS:0.6684 BLEU:75.51 SacreBLEU:76.05 ROUGE1:86.82 ROUGE2:76.07 ROUGEL:86.42 ROUGELsum:86.42 LR:0.00009000


VALID LOSS:0.7751: 100%|██████████| 200/200 [00:26<00:00,  7.50it/s]


(Epoch 2) VALID LOSS:0.7751 BLEU:71.56 SacreBLEU:71.58 ROUGE1:85.10 ROUGE2:73.77 ROUGEL:84.69 ROUGELsum:84.67


(Epoch 3) TRAIN LOSS:0.4484 LR:0.00008100: 100%|██████████| 1447/1447 [09:48<00:00,  2.46it/s]


(Epoch 3) TRAIN LOSS:0.4484 BLEU:80.26 SacreBLEU:80.70 ROUGE1:89.90 ROUGE2:80.99 ROUGEL:89.62 ROUGELsum:89.63 LR:0.00008100


VALID LOSS:0.7599: 100%|██████████| 200/200 [00:26<00:00,  7.49it/s]


(Epoch 3) VALID LOSS:0.7599 BLEU:72.86 SacreBLEU:72.87 ROUGE1:85.66 ROUGE2:74.62 ROUGEL:85.24 ROUGELsum:85.22


(Epoch 4) TRAIN LOSS:0.2937 LR:0.00007290: 100%|██████████| 1447/1447 [09:47<00:00,  2.46it/s]


(Epoch 4) TRAIN LOSS:0.2937 BLEU:84.92 SacreBLEU:85.25 ROUGE1:92.72 ROUGE2:85.86 ROUGEL:92.52 ROUGELsum:92.52 LR:0.00007290


VALID LOSS:0.7935: 100%|██████████| 200/200 [00:26<00:00,  7.48it/s]


(Epoch 4) VALID LOSS:0.7935 BLEU:72.88 SacreBLEU:72.90 ROUGE1:85.74 ROUGE2:74.86 ROUGEL:85.36 ROUGELsum:85.34


(Epoch 5) TRAIN LOSS:0.1921 LR:0.00006561: 100%|██████████| 1447/1447 [09:48<00:00,  2.46it/s]


(Epoch 5) TRAIN LOSS:0.1921 BLEU:89.17 SacreBLEU:89.40 ROUGE1:95.03 ROUGE2:90.22 ROUGEL:94.91 ROUGELsum:94.91 LR:0.00006561


VALID LOSS:0.8134: 100%|██████████| 200/200 [00:26<00:00,  7.49it/s]


(Epoch 5) VALID LOSS:0.8134 BLEU:73.45 SacreBLEU:73.47 ROUGE1:86.09 ROUGE2:75.29 ROUGEL:85.68 ROUGELsum:85.67


(Epoch 6) TRAIN LOSS:0.1250 LR:0.00005905: 100%|██████████| 1447/1447 [09:47<00:00,  2.46it/s]


(Epoch 6) TRAIN LOSS:0.1250 BLEU:92.67 SacreBLEU:92.82 ROUGE1:96.71 ROUGE2:93.49 ROUGEL:96.65 ROUGELsum:96.65 LR:0.00005905


VALID LOSS:0.8431: 100%|██████████| 200/200 [00:26<00:00,  7.49it/s]


(Epoch 6) VALID LOSS:0.8431 BLEU:73.72 SacreBLEU:73.73 ROUGE1:86.17 ROUGE2:75.48 ROUGEL:85.80 ROUGELsum:85.83


(Epoch 7) TRAIN LOSS:0.0850 LR:0.00005314: 100%|██████████| 1447/1447 [09:47<00:00,  2.46it/s]


(Epoch 7) TRAIN LOSS:0.0850 BLEU:95.00 SacreBLEU:95.10 ROUGE1:97.87 ROUGE2:95.78 ROUGEL:97.84 ROUGELsum:97.84 LR:0.00005314


VALID LOSS:0.8772: 100%|██████████| 200/200 [00:26<00:00,  7.51it/s]


(Epoch 7) VALID LOSS:0.8772 BLEU:74.04 SacreBLEU:74.05 ROUGE1:86.43 ROUGE2:75.86 ROUGEL:85.99 ROUGELsum:86.02


(Epoch 8) TRAIN LOSS:0.0622 LR:0.00004783: 100%|██████████| 1447/1447 [09:46<00:00,  2.47it/s]


(Epoch 8) TRAIN LOSS:0.0622 BLEU:96.46 SacreBLEU:96.53 ROUGE1:98.52 ROUGE2:97.03 ROUGEL:98.50 ROUGELsum:98.50 LR:0.00004783


VALID LOSS:0.8970: 100%|██████████| 200/200 [00:26<00:00,  7.54it/s]


(Epoch 8) VALID LOSS:0.8970 BLEU:74.50 SacreBLEU:74.52 ROUGE1:86.48 ROUGE2:76.12 ROUGEL:86.09 ROUGELsum:86.12


(Epoch 9) TRAIN LOSS:0.0467 LR:0.00004305: 100%|██████████| 1447/1447 [09:45<00:00,  2.47it/s]


(Epoch 9) TRAIN LOSS:0.0467 BLEU:97.43 SacreBLEU:97.48 ROUGE1:98.95 ROUGE2:97.88 ROUGEL:98.94 ROUGELsum:98.94 LR:0.00004305


VALID LOSS:0.9125: 100%|██████████| 200/200 [00:26<00:00,  7.51it/s]


(Epoch 9) VALID LOSS:0.9125 BLEU:74.46 SacreBLEU:74.48 ROUGE1:86.51 ROUGE2:76.17 ROUGEL:86.15 ROUGELsum:86.15
count stop: 1


(Epoch 10) TRAIN LOSS:0.0368 LR:0.00003874: 100%|██████████| 1447/1447 [09:48<00:00,  2.46it/s]


(Epoch 10) TRAIN LOSS:0.0368 BLEU:98.08 SacreBLEU:98.12 ROUGE1:99.21 ROUGE2:98.39 ROUGEL:99.20 ROUGELsum:99.20 LR:0.00003874


VALID LOSS:0.9165: 100%|██████████| 200/200 [00:26<00:00,  7.47it/s]


(Epoch 10) VALID LOSS:0.9165 BLEU:74.64 SacreBLEU:74.66 ROUGE1:86.65 ROUGE2:76.34 ROUGEL:86.27 ROUGELsum:86.29


In [None]:
# Load best model
# map_location='cpu' deserializes the checkpoint onto the CPU instead of the device it
# was saved from; load_state_dict then copies the values into the model's parameters.
model.load_state_dict(torch.load(model_dir + "/best_model_0.th", map_location='cpu'))

<All keys matched successfully>

In [None]:
# Evaluate
# Uses the configured `device` (as the train() call does) instead of a hard-coded string.
test_loss, test_metrics, test_hyp, test_label = evaluate(model, data_loader=test_loader, forward_fn=forward_generation, 
                                                         metrics_fn=generation_metrics_fn, model_type=model_type, 
                                                         tokenizer=tokenizer, beam_size=beam_size, 
                                                         max_seq_len=max_seq_len, is_test=True, 
                                                         device=device)

TESTING... : 100%|██████████| 400/400 [15:22<00:00,  2.31s/it]


In [None]:
# Collect this run's metrics and predictions; the one-element lists keep the
# same names and shapes a multi-run version of this cell would use.
metrics_scores = [test_metrics]
result_dfs = [pd.DataFrame({'hyp': test_hyp, 'label': test_label})]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)

# Show the first few hypothesis/label pairs, followed by a blank line.
print('== Prediction Result ==', result_df.head(), '', sep='\n')

# describe() on a single row yields count/mean plus NaN for std.
print('== Model Performance ==', metric_df.describe(), sep='\n')

# Persist the predictions and the summary table next to the model checkpoints.
result_df.to_csv(f"{model_dir}/prediction_indobart_id-min.csv")
metric_df.describe().to_csv(f"{model_dir}/evaluation_result_id-min.csv")

== Prediction Result ==
                                                 hyp  \
0  ti bon eka mat o & qu ot ; many us ua polemik ...   
1  sop hr onica o ch re ic eps ad olah kumbang ta...   
2  umum n yo manusia jo mudah man ari mo prinsip ...   
3  bagian kode nan mang ara jo an tugas  <0x40> -...   
4  namun panik ahan tasa bu ik ind ak bar akh ia ...   

                                               label  
0  kah ad iran & qu ot ; bo eka mata & qu ot ; ma...  
1  sop hr onica o ch re ic eps ad olah kumbang ta...  
2  umum n yo manusia jo mudah man ari mo prinsip ...  
3  bagian kode nan mang ara jo an tugas  <0x40> -...  
4  namun panik ahan tu ind ak bar akh ia elok , s...  

== Model Performance ==
            BLEU  SacreBLEU     ROUGE1     ROUGE2     ROUGEL  ROUGELsum
count   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000
mean   65.394824  65.407026  80.413223  70.511056  79.804539  79.793947
std          NaN        NaN        NaN        NaN        NaN        

In [None]:
# Copy the best checkpoint from the local save directory to Google Drive.
# NOTE(review): this targets 'My Drive' while other cells use 'MyDrive' — confirm both paths exist on the runtime.
! cp './save/minang/example_id_min/best_model_0.th' '/content/drive/My Drive/'

In [None]:
# Copy the saved test-set predictions CSV to Google Drive.
# NOTE(review): this targets 'My Drive' while other cells use 'MyDrive' — confirm both paths exist on the runtime.
! cp './save/minang/example_id_min/prediction_indobart_id-min.csv' '/content/drive/My Drive/'