In [1]:
import os

# Expose only GPU index 2 to CUDA. This is set before torch is imported in
# the next cell.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})

In [2]:
import random
import numpy as np
import torch

def set_seed(seed: int):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Value handed unchanged to each library's seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_library in seeders:
        seed_library(seed)


# Seed once, up front, before any data generation or model code runs.
set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Number of CUDA devices torch reports; used further down to derive the
# per-device training batch size.
n_gpu = torch.cuda.device_count()

# Data

In [4]:
import data_utils
# Locations of the train / dev / test split files.
train_path = "../data/absa/id/william/train.txt"
val_path = "../data/absa/id/william/dev.txt"
test_path = "../data/absa/id/william/test.txt"

# Read the three splits in train, val, test order.
train, val, test = (
    data_utils.read_data(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [5]:
# Training mixes two extraction tasks (orders "oa" and "as") with two
# imputation tasks that carry a reduced order alongside the full "oas" order.
train_tasks = [
    dict(paradigm="extraction", se_order="oa", method="lego_absa"),
    dict(paradigm="extraction", se_order="as", method="lego_absa"),
    dict(paradigm="imputation", reduced_se_order="oa", se_order="oas", method="lego_absa"),
    dict(paradigm="imputation", reduced_se_order="as", se_order="oas", method="lego_absa"),
]

# Validation and test each use a single "oas" extraction task. They are built
# separately so the two lists never share a dict object.
val_tasks = [dict(paradigm="extraction", se_order="oas", method="lego_absa")]

test_tasks = [dict(paradigm="extraction", se_order="oas", method="lego_absa")]

In [6]:
# The validation and test splits are generated with identical settings.
eval_gen_kwargs = dict(nt_se_order="aos", n_fold=1, algo="round_robin", shuffle=False)

train_ds = data_utils.data_gen(
    data=train,
    nt_se_order="aos",
    tasks=train_tasks,
    n_fold=4,
    algo="random",
    shuffle=True,
)
val_ds = data_utils.data_gen(data=val, tasks=val_tasks, **eval_gen_kwargs)
test_ds = data_utils.data_gen(data=test, tasks=test_tasks, **eval_gen_kwargs)

100%|██████████| 12000/12000 [00:02<00:00, 4254.84it/s]
100%|██████████| 1000/1000 [00:00<00:00, 17383.63it/s]
100%|██████████| 1000/1000 [00:00<00:00, 18298.08it/s]


In [7]:
# Spot-check: print every generated training example whose input starts with
# the probe text below.
probe_prefix = "pngen kembali lagi buat menginap"
for el in train_ds:
    if not el["input"].startswith(probe_prefix):
        continue
    print(el)

{'input': 'pngen kembali lagi buat menginap .| opinion : <extra_id_0> ,aspect : <extra_id_1>', 'output': 'NULL', 'se_order': 'oa'}
{'input': 'pngen kembali lagi buat menginap .| aspect : <extra_id_0> ,sentiment : <extra_id_1>', 'output': 'NULL', 'se_order': 'as'}


In [8]:
# Display the first generated training example.
train_ds[0]

{'input': 'tempatnya , harga , dan pelayanan sesuai dengan harga .| opinion : sesuai ,aspect : tempatnya ,sentiment : <extra_id_0> ; opinion : sesuai ,aspect : harga ,sentiment : <extra_id_1> ; opinion : sesuai ,aspect : pelayanan ,sentiment : <extra_id_2>',
 'output': '<extra_id_0> sesuai <extra_id_1> tempatnya <extra_id_2> positive ; <extra_id_3> sesuai <extra_id_4> harga <extra_id_5> positive ; <extra_id_6> sesuai <extra_id_7> pelayanan <extra_id_8> positive',
 'se_order': 'oas'}

In [9]:
from datasets import Dataset

# Convert each generated split into a Hugging Face `Dataset`, rebinding the
# same three names (train first, then val, then test).
train_ds, val_ds, test_ds = map(Dataset.from_list, (train_ds, val_ds, test_ds))

In [10]:
# Display the training dataset summary (feature names and row count).
train_ds

Dataset({
    features: ['input', 'output', 'se_order'],
    num_rows: 8022
})

In [11]:
# Display the validation dataset summary (feature names and row count).
val_ds

Dataset({
    features: ['input', 'output', 'se_order'],
    num_rows: 998
})

In [12]:
# Display the test dataset summary (feature names and row count).
test_ds

Dataset({
    features: ['input', 'output', 'se_order'],
    num_rows: 995
})

# Tokenize

In [13]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the model is loaded from later.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

# Keyword arguments forwarded to every tokenizer call made by `encode_fn`.
# NOTE(review): padding is applied here, at tokenization time, and the saved
# evaluation log shows long runs of <pad> in the decoded inputs. Presumably
# the targets are padded the same way -- confirm that the loss and metrics
# are meant to see those positions.
encoding_args = dict(
    max_length=256,
    padding=True,
    truncation=True,
    return_tensors="pt",
)


def encode_fn(x):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        x: Mapping with an "input" entry (source text) and an "output" entry
            (target text); the latter is passed as `text_target`.

    Returns:
        Whatever the tokenizer returns for that call with `encoding_args`.
    """
    return tokenizer(x["input"], text_target=x["output"], **encoding_args)



In [14]:
def _tokenize_split(split_ds):
    """Apply `encode_fn` to one split in batches and return it in torch format.

    The split's original columns are dropped, so only the fields produced by
    `encode_fn` remain.
    """
    tokenized = split_ds.map(
        encode_fn,
        batched=True,
        remove_columns=split_ds.column_names,
    )
    tokenized.set_format("torch")
    return tokenized


train_tok = _tokenize_split(train_ds)
val_tok = _tokenize_split(val_ds)
test_tok = _tokenize_split(test_ds)

                                                                 

In [15]:
from transformers import DataCollatorForSeq2Seq

# Batch collator built around the tokenizer; handed to the Seq2SeqTrainer
# further down as its `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Train

In [16]:
from transformers import Seq2SeqTrainingArguments

# Target training batch size across all devices; it is split evenly over the
# CUDA devices counted in `n_gpu`.
total_train_batch_size = 16

# `n_gpu` is 0 when no CUDA device is visible (the notebook later falls back
# to the CPU in that case), so the divisor is clamped to avoid a
# ZeroDivisionError. The result is clamped as well so it cannot become 0 when
# there are more devices than `total_train_batch_size`.
per_device_train_batch_size = max(1, total_train_batch_size // max(n_gpu, 1))

# NOTE(review): the training log saved in this notebook reports
# "Num Epochs = 10", which does not match the 20 configured here -- the log
# looks like it comes from an earlier run; confirm which value is intended.
train_args = {
    "num_train_epochs": 20,
    "learning_rate": 3e-4,
    "save_total_limit": 2,
    "gradient_accumulation_steps": 1,
    "per_device_train_batch_size": per_device_train_batch_size,
    "per_device_eval_batch_size": 8,
    # Save, evaluate and log once per epoch.
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "logging_strategy": "epoch",
    # Presumably a key of the dict returned by `compute_metrics` -- confirm
    # against the evaluation module.
    "metric_for_best_model": "overall_f1_score",
    "load_best_model_at_end": True,
    "adam_epsilon": 1e-08,
    "output_dir": "./output",
    "logging_dir": "./output/log",
    "include_inputs_for_metrics": True,
}

# Rebind the name from the plain dict to the arguments object built from it.
train_args = Seq2SeqTrainingArguments(**train_args)

In [17]:
import torch
# Use the first visible CUDA device when there is one; otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda:0


In [18]:
from transformers import Seq2SeqTrainer, AutoModelForSeq2SeqLM

# Load the pretrained checkpoint and move it to the selected device. `.to`
# hands back the module itself, so `model` is the moved model.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base").to(device)

# Last expression of the cell: show the module structure.
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (w

In [19]:
def preprocess_logits_for_metrics(logits, targets):
    """Reduce logits to predicted ids before they are used for metrics.

    Args:
        logits: Either the logits themselves or a tuple whose first element
            holds them.
        targets: Passed through untouched.

    Returns:
        A `(pred_ids, targets)` pair, where `pred_ids` is the argmax of the
        logits over their last dimension.
    """
    if isinstance(logits, tuple):
        # Only the first entry of a tuple output is used.
        logits = logits[0]
    return torch.argmax(logits, dim=-1), targets

In [20]:
from evaluation import compute_metrics

# Answer-extraction callable handed to `compute_metrics`.
catch_answer_fn = data_utils.AnswerCatcher().lego_absa

# Decoding options handed to `compute_metrics`. Special tokens are kept,
# presumably so the <extra_id_N> markers in the targets survive decoding --
# confirm against the evaluation module.
decoding_args = dict(skip_special_tokens=False)


def _compute_val_metrics(eval_preds):
    """Score evaluation predictions with the project's `compute_metrics`.

    NOTE(review): the `se_order` column is always taken from `val_ds`, so this
    callback looks specific to the validation split -- confirm before reusing
    the trainer to evaluate another dataset.
    """
    return compute_metrics(
        catch_answer_fn,
        eval_preds,
        decoding_args,
        tokenizer,
        val_ds["se_order"],
    )


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=_compute_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 8022
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5020
  Number of trainable parameters = 582401280
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 998
  Batch size = 8
  Num examples = 998
  Batch size = 8


INPUTS >> pelayanan sudah bagus, tempat juga bagus, akan lebih bagus lagi jika tempat untuk mencharger hp di tambah. terimakasih.| opinion : <extra_id_0>,aspect : <extra_id_1>,sentiment : <extra_id_2></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
TARGETS >> <extra_id_0> bagus <extra_id_1> pelayanan <extra_id_2> posi

Saving model checkpoint to ./output/checkpoint-502
Configuration saved in ./output/checkpoint-502/config.json
Configuration saved in ./output/checkpoint-502/config.json
Model weights saved in ./output/checkpoint-502/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-502/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-502/special_tokens_map.json
Copy vocab file to ./output/checkpoint-502/spiece.model
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


INPUTS >> pelayanan sudah bagus, tempat juga bagus, akan lebih bagus lagi jika tempat untuk mencharger hp di tambah. terimakasih.| opinion : <extra_id_0>,aspect : <extra_id_1>,sentiment : <extra_id_2></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
TARGETS >> <extra_id_0> bagus <extra_id_1> pelayanan <extra_id_2> posi

Saving model checkpoint to ./output/checkpoint-1004
Configuration saved in ./output/checkpoint-1004/config.json
Configuration saved in ./output/checkpoint-1004/config.json
Model weights saved in ./output/checkpoint-1004/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-1004/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1004/special_tokens_map.json
Copy vocab file to ./output/checkpoint-1004/spiece.model
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


INPUTS >> pelayanan sudah bagus, tempat juga bagus, akan lebih bagus lagi jika tempat untuk mencharger hp di tambah. terimakasih.| opinion : <extra_id_0>,aspect : <extra_id_1>,sentiment : <extra_id_2></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
TARGETS >> <extra_id_0> bagus <extra_id_1> pelayanan <extra_id_2> posi

Saving model checkpoint to ./output/checkpoint-1506
Configuration saved in ./output/checkpoint-1506/config.json
Configuration saved in ./output/checkpoint-1506/config.json
Model weights saved in ./output/checkpoint-1506/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-1506/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1506/special_tokens_map.json
Copy vocab file to ./output/checkpoint-1506/spiece.model
Deleting older checkpoint [output/checkpoint-502] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


INPUTS >> pelayanan sudah bagus, tempat juga bagus, akan lebih bagus lagi jika tempat untuk mencharger hp di tambah. terimakasih.| opinion : <extra_id_0>,aspect : <extra_id_1>,sentiment : <extra_id_2></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
TARGETS >> <extra_id_0> bagus <extra_id_1> pelayanan <extra_id_2> posi

Saving model checkpoint to ./output/checkpoint-2008
Configuration saved in ./output/checkpoint-2008/config.json
Configuration saved in ./output/checkpoint-2008/config.json
Model weights saved in ./output/checkpoint-2008/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-2008/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-2008/special_tokens_map.json
Copy vocab file to ./output/checkpoint-2008/spiece.model
Deleting older checkpoint [output/checkpoint-1004] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


INPUTS >> pelayanan sudah bagus, tempat juga bagus, akan lebih bagus lagi jika tempat untuk mencharger hp di tambah. terimakasih.| opinion : <extra_id_0>,aspect : <extra_id_1>,sentiment : <extra_id_2></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
TARGETS >> <extra_id_0> bagus <extra_id_1> pelayanan <extra_id_2> posi

Saving model checkpoint to ./output/checkpoint-2510
Configuration saved in ./output/checkpoint-2510/config.json
Configuration saved in ./output/checkpoint-2510/config.json
Model weights saved in ./output/checkpoint-2510/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-2510/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-2510/special_tokens_map.json
Copy vocab file to ./output/checkpoint-2510/spiece.model
Deleting older checkpoint [output/checkpoint-1506] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 998
  Batch size = 8


INPUTS >> pelayanan sudah bagus, tempat juga bagus, akan lebih bagus lagi jika tempat untuk mencharger hp di tambah. terimakasih.| opinion : <extra_id_0>,aspect : <extra_id_1>,sentiment : <extra_id_2></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
TARGETS >> <extra_id_0> bagus <extra_id_1> pelayanan <extra_id_2> posi

Saving model checkpoint to ./output/checkpoint-3012
Configuration saved in ./output/checkpoint-3012/config.json
Configuration saved in ./output/checkpoint-3012/config.json
Model weights saved in ./output/checkpoint-3012/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-3012/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-3012/special_tokens_map.json
Copy vocab file to ./output/checkpoint-3012/spiece.model
Deleting older checkpoint [output/checkpoint-2008] due to args.save_total_limit
