In [1]:
import random
import numpy as np
import torch

def set_seed(seed: int):
    """Seed every random number generator this notebook relies on.

    The same ``seed`` is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator, and the
    generators of all CUDA devices.

    Args:
        seed: Integer seed shared by all libraries.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

set_seed(42)

# Data

In [2]:
import data_utils
# Locations of the train / dev / test split files, relative to this notebook.
train_path, val_path, test_path = (
    "../data/absa/id/william/train.txt",
    "../data/absa/id/william/dev.txt",
    "../data/absa/id/william/test.txt",
)

# Parse each split with the project's reader, in train -> val -> test order.
train, val, test = (
    data_utils.read_data(split_path)
    for split_path in (train_path, val_path, test_path)
)

In [3]:
# Task specifications consumed by data_utils.data_gen. Judging by the generated
# prompts, the letters in "se_order" stand for opinion (o), aspect (a) and
# sentiment (s) -- TODO confirm against data_utils.
#
# Training mixes pair-level extraction with imputation from a reduced pair
# ("reduced_se_order") up to the full "oas" triplet.
train_tasks = [
    dict(paradigm="extraction", se_order="oa", method="lego_absa"),
    dict(paradigm="extraction", se_order="as", method="lego_absa"),
    dict(paradigm="imputation", reduced_se_order="oa", se_order="oas", method="lego_absa"),
    dict(paradigm="imputation", reduced_se_order="as", se_order="oas", method="lego_absa"),
]

# Validation and test both use a single full-triplet extraction task.
val_tasks = [dict(paradigm="extraction", se_order="oas", method="lego_absa")]

test_tasks = [dict(paradigm="extraction", se_order="oas", method="lego_absa")]

In [4]:
# Training examples: generated from the multi-task training mix with the
# "random" algorithm, 4 folds and shuffling enabled.
train_ds = data_utils.data_gen(
    data=train,
    nt_se_order="aos",
    tasks=train_tasks,
    n_fold=4,
    algo="random",
    shuffle=True,
)

# Validation and test share the same generation settings: a single fold,
# "round_robin" task assignment and no shuffling.
eval_gen_options = dict(nt_se_order="aos", n_fold=1, algo="round_robin", shuffle=False)
val_ds = data_utils.data_gen(data=val, tasks=val_tasks, **eval_gen_options)
test_ds = data_utils.data_gen(data=test, tasks=test_tasks, **eval_gen_options)

100%|██████████| 12000/12000 [00:05<00:00, 2171.51it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14073.33it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14044.17it/s]


In [5]:
# Spot-check: print every generated training example whose prompt starts with
# one particular review sentence.
matching_examples = (
    example
    for example in train_ds
    if example["input"].startswith("pngen kembali lagi buat menginap")
)
for example in matching_examples:
    print(example)

{'input': 'pngen kembali lagi buat menginap .| opinion : <extra_id_0> ,aspect : <extra_id_1>', 'output': 'NULL', 'se_order': 'oa'}
{'input': 'pngen kembali lagi buat menginap .| aspect : <extra_id_0> ,sentiment : <extra_id_1>', 'output': 'NULL', 'se_order': 'as'}


In [6]:
# Display the first generated training example (input prompt, target output, se_order).
train_ds[0]

{'input': 'tempatnya , harga , dan pelayanan sesuai dengan harga .| opinion : sesuai ,aspect : tempatnya ,sentiment : <extra_id_0> ; opinion : sesuai ,aspect : harga ,sentiment : <extra_id_1> ; opinion : sesuai ,aspect : pelayanan ,sentiment : <extra_id_2>',
 'output': '<extra_id_0> sesuai <extra_id_1> tempatnya <extra_id_2> positive ; <extra_id_3> sesuai <extra_id_4> harga <extra_id_5> positive ; <extra_id_6> sesuai <extra_id_7> pelayanan <extra_id_8> positive',
 'se_order': 'oas'}

In [7]:
from datasets import Dataset

# Wrap each split's generated examples in a Hugging Face `Dataset`, rebinding
# the same three names to the converted objects.
train_ds, val_ds, test_ds = (
    Dataset.from_list(examples)
    for examples in (train_ds, val_ds, test_ds)
)

In [8]:
# Show the training Dataset summary (feature names and row count).
train_ds

Dataset({
    features: ['input', 'output', 'se_order'],
    num_rows: 8022
})

In [9]:
# Show the validation Dataset summary (feature names and row count).
val_ds

Dataset({
    features: ['input', 'output', 'se_order'],
    num_rows: 998
})

In [10]:
# Show the test Dataset summary (feature names and row count).
test_ds

Dataset({
    features: ['input', 'output', 'se_order'],
    num_rows: 995
})

# Tokenize

In [11]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint that is fine-tuned later in this
# notebook ("google/mt5-base"). The previous value ("google/mt5-large") did not
# match the model checkpoint; the two are expected to ship the same vocabulary,
# so this should only remove the inconsistency -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")

# Options forwarded to every tokenizer call: truncate to at most 256 tokens,
# pad, and return PyTorch tensors.
# NOTE(review): with "padding": True the targets are padded at tokenization
# time, so padded label positions presumably hold the pad-token id rather than
# an ignore index -- confirm this is what the loss/metrics expect.
encoding_args = {
    "max_length" : 256,
    "padding" : True,
    "truncation" : True,
    "return_tensors" : "pt"
}

def encode_fn(x):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        x: Mapping with an "input" entry (source text) and an "output" entry
            (target text), as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's encoding of the inputs, with the targets encoded via
        ``text_target``.
    """
    return tokenizer(x["input"], text_target=x["output"], **encoding_args)

2023-06-26 14:21:42.690312: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-26 14:21:42.957241: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-26 14:21:42.957268: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-26 14:21:44.288402: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [13]:
def _tokenize_split(split):
    """Tokenize one split with ``encode_fn`` and format it as torch tensors.

    The split's original columns are removed, so only the columns produced by
    ``encode_fn`` remain.
    """
    tokenized = split.map(encode_fn, batched=True, remove_columns=split.column_names)
    tokenized.set_format("torch")
    return tokenized

train_tok = _tokenize_split(train_ds)

val_tok = _tokenize_split(val_ds)

test_tok = _tokenize_split(test_ds)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
from transformers import DataCollatorForSeq2Seq

# Collator used by the trainer to assemble batches of tokenized seq2seq
# examples; it is built from the tokenizer only (no model is passed).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Train

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the Seq2SeqTrainer.
# The arguments are passed directly so `train_args` is only ever bound to the
# Seq2SeqTrainingArguments object (it used to be a dict first, then rebound).
train_args = Seq2SeqTrainingArguments(
    # Optimisation
    num_train_epochs=10,
    learning_rate=3e-4,
    adam_epsilon=1e-08,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Save, evaluate and log once per epoch; keep at most two checkpoints and
    # reload the best one (by "overall_f1_score") when training finishes.
    save_total_limit=2,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    metric_for_best_model="overall_f1_score",
    load_best_model_at_end=True,
    # Output locations
    output_dir="./output",
    logging_dir="./output/log",
    # Hand the input ids to compute_metrics together with predictions/labels.
    include_inputs_for_metrics=True,
    # Evaluate on generated token ids instead of teacher-forced logits, so the
    # metric function (which is given the tokenizer and decoding options) can
    # decode real model outputs and vocabulary-sized logits are not accumulated
    # for the whole evaluation set.
    # NOTE(review): assumes compute_metrics decodes token ids -- confirm.
    predict_with_generate=True,
    # Allow generations as long as the 256-token limit used at tokenization.
    generation_max_length=256,
)

In [None]:
import torch
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
from transformers import Seq2SeqTrainer, AutoModelForSeq2SeqLM

# Load the pretrained mT5-base seq2seq model and place it on the selected
# device. The result of `.to(device)` is bound to `model` rather than left as
# the cell's trailing expression, so the notebook does not print the full
# module representation into the cell output.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
model = model.to(device)

In [None]:
from evaluation import compute_metrics

# Answer-parsing routine for the "lego_absa" method, taken from the project's
# AnswerCatcher (presumably turns generated text back into structured answers
# -- verify in data_utils).
catch_answer_fn = data_utils.AnswerCatcher().lego_absa

# Decoding options handed to compute_metrics; special tokens are not skipped.
decoding_args = dict(skip_special_tokens=False)

def _score_eval_predictions(eval_preds):
    """Forward the trainer's evaluation predictions to the project's metric routine.

    The validation set's "se_order" column is looked up each time this is called.
    """
    return compute_metrics(catch_answer_fn, eval_preds, decoding_args, tokenizer, val_ds["se_order"])

trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=_score_eval_predictions,
)

trainer.train()