In [1]:
import os

# Expose only physical GPU index 2 to this process. This is set before torch is
# imported further down, where the single visible GPU is addressed as "cuda:0".
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
import sys
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Extend the import path with the project's src/ directory (relative to this
# notebook) before importing `data_utils` — presumably that is where it lives.
sys.path.append("../../src/")
import data_utils

  from .autonotebook import tqdm as notebook_tqdm


# Dataset Utilities

In [3]:
# Root directory of every raw dataset, keyed by domain.
peng_dir = dict(
    lap14 = "../../data/absa/en/peng/14lap",
    res14 = "../../data/absa/en/peng/14res",
    res15 = "../../data/absa/en/peng/15res",
    res16 = "../../data/absa/en/peng/16res"
)

wan_dir = dict(
    res15 = "../../data/absa/en/wan/interim/rest15",
    res16 = "../../data/absa/en/wan/interim/rest16"
)

zhang_dir = dict(
    res15 = "../../data/absa/en/zhang/interim/interim_2/rest15",
    res16 = "../../data/absa/en/zhang/interim/interim_2/rest16"
)

william_dir = dict(
    hotel = "../../data/absa/id/william"
)

# Split name -> file name. The Peng release uses a "_triplets" suffix; the
# other three datasets share the plain naming scheme.
_TRIPLET_FILES = dict(train="train_triplets.txt", val="dev_triplets.txt", test="test_triplets.txt")
_PLAIN_FILES = dict(train="train.txt", val="dev.txt", test="test.txt")


def _load_splits(base_dir, target_format, file_names):
    """
    Read every split of one dataset directory with `data_utils.read_data`.

    ### PARAMS
    * base_dir: Directory containing the split files.
    * target_format: Target format string forwarded to `read_data` (e.g. "aos").
    * file_names: Mapping of split name -> file name inside `base_dir`.
    ### RETURN
    * Dict mapping each split name to whatever `read_data` returns for it,
      in the iteration order of `file_names`.
    """
    return {
        split: data_utils.read_data(path=base_dir + "/" + file_name,
                                    target_format=target_format)
        for split, file_name in file_names.items()
    }


# dataset[domain][split] -> rows as returned by data_utils.read_data
peng = {domain: _load_splits(path, "aos", _TRIPLET_FILES) for domain, path in peng_dir.items()}

wan = {domain: _load_splits(path, "acs", _PLAIN_FILES) for domain, path in wan_dir.items()}

zhang = {domain: _load_splits(path, "acso", _PLAIN_FILES) for domain, path in zhang_dir.items()}

william = {domain: _load_splits(path, "aos", _PLAIN_FILES) for domain, path in william_dir.items()}

# Data Preprocessing 1

In [4]:
# Overwrite data_utils' module-level table mapping a one-letter element code to the
# dictionary key used in targets; construct_answer/construct_prompt/catch_answer
# below all look element names up through this table.
data_utils.SENTIMENT_ELEMENT = {'a' : "aspect", 'o' : "opinion", 's' : "sentiment", 'c' : "category"}

1. AOS (ASTE)
    * AO
    * AS
    * A
    * O

2. ACS (TASD)
    * AS
    * CS
    * A
    * C

3. ACOS
    * AO
    * AS
    * CS
    * A
    * O
    * C

In [5]:
# Each complex task (key) is decomposed into the simpler tasks (values) used for training.
task_tree = {
    "oas" : ["oa","as",'a','o'],
    "asc" : ["as","sc",'a','c'],
    "oasc" : ["oa","as","sc",'a','o','c']
}

# Flatten to a list of unique task names, keeping first-seen order
# (a parent task always comes before its sub-tasks).
all_task = list(dict.fromkeys(
    name
    for parent, children in task_tree.items()
    for name in [parent] + children
))

print(all_task)

['oas', 'oa', 'as', 'a', 'o', 'asc', 'sc', 'c', 'oasc']


In [6]:
# Demo: reducing two triplets that differ only in sentiment to "ao" makes them
# identical, so de-duplication leaves a single aspect/opinion pair.
data_utils.remove_duplicate_targets(data_utils.reduce_targets([{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"}],"ao"))

[{'aspect': 'battery life', 'opinion': 'good'}]

Handling mixed sentiment may not be strictly necessary, but we will revisit it later. It could become a problem if, for example, the `as` task (UABSA / E2E ABSA) is used to train AOS (ASTE) --> this may be left for a further experiment, because imputing will be added later on.

In [7]:
# Demo: the same aspect/opinion pair with conflicting sentiments is collapsed into
# one target (see the output below for the resulting sentiment label).
data_utils.handle_mix_sentiment(data_utils.reduce_targets([{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"}],"aos"))

[{'aspect': 'battery life', 'opinion': 'good', 'sentiment': 'mixed'}]

In [8]:
from copy import deepcopy

# Peng (ASTE/AOS)
# peng_intermediate[domain][task][split] -> rows whose targets were reduced to `task`
peng_intermediate = dict()

for domain, splits in peng.items():
    tasks_for_domain = dict()
    # The complex task itself plus every sub-task derived from it
    for task in ["oas"] + task_tree["oas"]:
        splits_for_task = dict()
        for split in splits.keys():
            # Private copy so the raw rows in `peng` keep their original targets
            # (rows are assumed to be a list of dicts — TODO confirm against read_data)
            rows = deepcopy(peng[domain][split])
            for row in rows:
                # Reduce, then remove duplicates
                reduced = data_utils.reduce_targets(row["target"],task)
                row["target"] = data_utils.remove_duplicate_targets(reduced)
            splits_for_task[split] = rows
        tasks_for_domain[task] = splits_for_task
    peng_intermediate[domain] = tasks_for_domain

In [9]:
# Wan (TASD/ACS)
# wan_intermediate[domain][task][split] -> rows whose targets were reduced to `task`
wan_intermediate = dict()

for domain, splits in wan.items():
    tasks_for_domain = dict()
    # The complex task itself plus every sub-task derived from it
    for task in ["asc"] + task_tree["asc"]:
        splits_for_task = dict()
        for split in splits.keys():
            # Private copy so the raw rows in `wan` keep their original targets
            # (rows are assumed to be a list of dicts — TODO confirm against read_data)
            rows = deepcopy(wan[domain][split])
            for row in rows:
                # Reduce, then remove duplicates
                reduced = data_utils.reduce_targets(row["target"],task)
                row["target"] = data_utils.remove_duplicate_targets(reduced)
            splits_for_task[split] = rows
        tasks_for_domain[task] = splits_for_task
    wan_intermediate[domain] = tasks_for_domain

In [10]:
# Zhang (ACOS)
# zhang_intermediate[domain][task][split] -> rows whose targets were reduced to `task`
zhang_intermediate = dict()

for domain, splits in zhang.items():
    tasks_for_domain = dict()
    # The complex task itself plus every sub-task derived from it
    for task in ["oasc"] + task_tree["oasc"]:
        splits_for_task = dict()
        for split in splits.keys():
            # Private copy so the raw rows in `zhang` keep their original targets
            # (rows are assumed to be a list of dicts — TODO confirm against read_data)
            rows = deepcopy(zhang[domain][split])
            for row in rows:
                # Reduce, then remove duplicates
                reduced = data_utils.reduce_targets(row["target"],task)
                row["target"] = data_utils.remove_duplicate_targets(reduced)
            splits_for_task[split] = rows
        tasks_for_domain[task] = splits_for_task
    zhang_intermediate[domain] = tasks_for_domain

In [11]:
# William (AOS ID)
# william_intermediate[domain][task][split] -> rows whose targets were reduced to `task`
william_intermediate = dict()

for domain, splits in william.items():
    tasks_for_domain = dict()
    # The complex task itself plus every sub-task derived from it
    for task in ["oas"] + task_tree["oas"]:
        splits_for_task = dict()
        for split in splits.keys():
            # Private copy so the raw rows in `william` keep their original targets
            # (rows are assumed to be a list of dicts — TODO confirm against read_data)
            rows = deepcopy(william[domain][split])
            for row in rows:
                # Reduce, then remove duplicates
                reduced = data_utils.reduce_targets(row["target"],task)
                row["target"] = data_utils.remove_duplicate_targets(reduced)
            splits_for_task[split] = rows
        tasks_for_domain[task] = splits_for_task
    william_intermediate[domain] = tasks_for_domain

# Answer Engineering

In [12]:
# Template for a sentinel token; the literal 'X' is replaced by a running index
# in construct_answer / construct_prompt.
mask = "<extra_id_X>"

In [13]:
def construct_answer(targets,se_order):
    """
    ### DESC
        Serialise a list of targets into one sentinel-delimited answer string.
        Every element of every target is preceded by its own sentinel
        (`<extra_id_0>`, `<extra_id_1>`, ...); the index keeps increasing across
        targets, and targets are separated by " ; ".
    ### PARAMS
    * targets: List of dicts keyed by element name ("aspect", "opinion", ...).
    * se_order: Iterable of one-letter element codes giving the output order.
    ### RETURN
    * The answer string ("" when `targets` is empty).
    ### RAISES
    * Exception: if an index above 99 would be needed.
    """
    serialised = []
    sentinel_idx = 0
    for target in targets:
        tokens = []
        for se in se_order:
            if sentinel_idx > 99:
                raise Exception("Extra id more than 99!")
            tokens.append(mask.replace('X',str(sentinel_idx)))
            tokens.append(target[data_utils.SENTIMENT_ELEMENT[se]])
            sentinel_idx += 1
        serialised.append(" ".join(tokens).strip())
    return " ; ".join(serialised)

In [14]:
# Sanity check: render the answer string for one lap14 training example.
construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")

'<extra_id_0> no <extra_id_1> GUI <extra_id_2> negative ; <extra_id_3> dark <extra_id_4> screen <extra_id_5> negative ; <extra_id_6> steady <extra_id_7> power light <extra_id_8> neutral ; <extra_id_9> steady <extra_id_10> hard drive light <extra_id_11> negative'

# Prompt Engineering

In [15]:
def construct_prompt(text,se_order):
    """
    ### DESC
        Append a task description to the input text. The description lists the
        requested elements in order, each paired with a sentinel token numbered
        from 0, e.g. "opinion : <extra_id_0> ,aspect : <extra_id_1>".
    ### PARAMS
    * text: Raw input sentence.
    * se_order: Iterable of one-letter element codes.
    ### RETURN
    * `text`, followed by "| " and the element/sentinel list joined with " ,".
    """
    slots = [
        data_utils.SENTIMENT_ELEMENT[se] + " : " + mask.replace('X',str(position))
        for position, se in enumerate(se_order)
    ]
    return text + "| " + " ,".join(slots)

In [16]:
# Sanity check: render the prompt for the same lap14 training example.
construct_prompt(peng_intermediate["lap14"]["oas"]["train"][4]["text"],"oas")

'One night I turned the freaking thing off after using it , the next day I turn it on , no GUI , screen all dark , power light steady , hard drive light steady and not flashing as it usually does .| opinion : <extra_id_0> ,aspect : <extra_id_1> ,sentiment : <extra_id_2>'

# Answer Catch

In [17]:
import re

def catch_answer(output,se_order):
    """
    ### DESC
        Parse a sentinel-delimited answer string (as produced by
        `construct_answer`) back into a list of target dicts.
    ### PARAMS
    * output: Text containing `<extra_id_N>` sentinels followed by element values.
    * se_order: Iterable of one-letter element codes, in the order the elements
      appear in `output`. Each code must be unique, because every element
      becomes a named regex group.
    ### RETURN
    * List with one dict per match, keyed by element name, values stripped of
      surrounding whitespace. Text that does not fit the pattern is skipped.
    """
    fragments = []
    for se in se_order:
        group_name = data_utils.SENTIMENT_ELEMENT[se]
        # Raw f-strings: `\d` / `\s` are regex escapes, not string escapes. In a
        # non-raw literal they trigger an invalid-escape warning.
        if se != 's':
            # Free text: anything up to the next target separator ';'
            fragments.append(rf"<extra_id_\d+>\s*(?P<{group_name}>[^;]+)\s*")
        else:
            # Sentiment is restricted to the three known polarity labels
            fragments.append(rf"<extra_id_\d+>\s*(?P<{group_name}>positive|negative|neutral)\s*")
    pattern = "".join(fragments)
    return [
        {name: value.strip() for name, value in match.groupdict().items()}
        for match in re.finditer(pattern,output)
    ]

In [18]:
# Round-trip check: build an answer string from gold targets, then parse it back.
output = construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")
se_order = "oas"
catch_answer(output,se_order)

[{'opinion': 'no', 'aspect': 'GUI', 'sentiment': 'negative'},
 {'opinion': 'dark', 'aspect': 'screen', 'sentiment': 'negative'},
 {'opinion': 'steady', 'aspect': 'power light', 'sentiment': 'neutral'},
 {'opinion': 'steady', 'aspect': 'hard drive light', 'sentiment': 'negative'}]

In [19]:
# Show the raw answer string that was parsed in the previous cell.
output

'<extra_id_0> no <extra_id_1> GUI <extra_id_2> negative ; <extra_id_3> dark <extra_id_4> screen <extra_id_5> negative ; <extra_id_6> steady <extra_id_7> power light <extra_id_8> neutral ; <extra_id_9> steady <extra_id_10> hard drive light <extra_id_11> negative'

# Data Preprocessing 2

In [20]:
from datasets import Dataset


def _to_examples(rows, task):
    """Turn reduced rows into {"input", "output", "task"} records for one task."""
    return [
        {
            "input" : construct_prompt(el["text"],task),
            "output" : construct_answer(el["target"],task),
            "task" : task
        }
        for el in rows
    ]


def build_seq2seq_splits(intermediate, complex_task):
    """
    ### DESC
        Build the train/val/test `Dataset`s of every domain of one corpus.
        Training data is the concatenation of all basic tasks listed in
        `task_tree[complex_task]` (in that order); validation and test data use
        the complex task only.
    ### PARAMS
    * intermediate: Mapping domain -> task -> split -> rows with "text"/"target".
    * complex_task: Key of `task_tree` naming the complex task (e.g. "oas").
    ### RETURN
    * Dict mapping domain -> {"train", "val", "test"} -> `datasets.Dataset`.
    """
    result = dict()
    for domain, per_task in intermediate.items():
        # TRAIN: basic tasks
        train_examples = []
        for basic_task in task_tree[complex_task]:
            train_examples.extend(_to_examples(per_task[basic_task]["train"], basic_task))
        # VAL / TEST: complex task
        val_examples = _to_examples(per_task[complex_task]["val"], complex_task)
        test_examples = _to_examples(per_task[complex_task]["test"], complex_task)
        result[domain] = {
            "train" : Dataset.from_list(train_examples),
            "val" : Dataset.from_list(val_examples),
            "test" : Dataset.from_list(test_examples)
        }
    return result


peng_2 = build_seq2seq_splits(peng_intermediate, "oas")

wan_2 = build_seq2seq_splits(wan_intermediate, "asc")

zhang_2 = build_seq2seq_splits(zhang_intermediate, "oasc")

william_2 = build_seq2seq_splits(william_intermediate, "oas")

In [21]:
# Inspect one training record of the Indonesian hotel dataset.
william_2["hotel"]["train"][69]

{'input': 'tempat yag bagus dan nyaman untuk istirahat tetapi tolong tvnya perlu di perbaiki channelnya karena banyak semutnya digambar dan water heaternya tidak bisa jadi mandi air dingin terus .| opinion : <extra_id_0> ,aspect : <extra_id_1>',
 'output': '<extra_id_0> bagus <extra_id_1> tempat ; <extra_id_2> nyaman <extra_id_3> tempat ; <extra_id_4> perlu di perbaiki <extra_id_5> tvnya ; <extra_id_6> tidak bisa <extra_id_7> water heaternya',
 'task': 'oa'}

# Prepare Tokenized Dataset

## English

In [22]:
# Tokenizer for the English datasets; same "t5-base" checkpoint as model_en below.
tokenizer_en = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [23]:
# Keyword arguments shared by both tokenizers (see encode_en / encode_id).
encoding_args = dict(
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [24]:
def encode_en(dataset):
    """
    Tokenize a batch with the English tokenizer.

    `dataset["input"]` is encoded as the model input and `dataset["output"]` as
    the target text, both using the shared `encoding_args`.
    """
    return tokenizer_en(
        dataset["input"],
        text_target=dataset["output"],
        **encoding_args,
    )

In [25]:
# peng_tok[domain][split] -> tokenized dataset. The raw "input"/"output" text
# columns are dropped after encoding; any other column is left in place.
peng_tok = {
    domain: {
        split: split_ds.map(encode_en,batched=True,remove_columns=["input","output"])
        for split, split_ds in splits.items()
    }
    for domain, splits in peng_2.items()
}

                                                                  

In [26]:
# wan_tok[domain][split] -> tokenized dataset. The raw "input"/"output" text
# columns are dropped after encoding; any other column is left in place.
wan_tok = {
    domain: {
        split: split_ds.map(encode_en,batched=True,remove_columns=["input","output"])
        for split, split_ds in splits.items()
    }
    for domain, splits in wan_2.items()
}

                                                                  

In [27]:
# zhang_tok[domain][split] -> tokenized dataset. The raw "input"/"output" text
# columns are dropped after encoding; any other column is left in place.
zhang_tok = {
    domain: {
        split: split_ds.map(encode_en,batched=True,remove_columns=["input","output"])
        for split, split_ds in splits.items()
    }
    for domain, splits in zhang_2.items()
}

                                                                  

## Indo

In [28]:
# Tokenizer for the Indonesian dataset; same checkpoint as model_id below.
tokenizer_id = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base")

In [29]:
def encode_id(dataset):
    """
    Tokenize a batch with the Indonesian tokenizer.

    `dataset["input"]` is encoded as the model input and `dataset["output"]` as
    the target text, both using the shared `encoding_args`.
    """
    return tokenizer_id(
        dataset["input"],
        text_target=dataset["output"],
        **encoding_args,
    )

In [30]:
# william_tok[domain][split] -> tokenized dataset. The raw "input"/"output" text
# columns are dropped after encoding; any other column is left in place.
william_tok = {
    domain: {
        split: split_ds.map(encode_id,batched=True,remove_columns=["input","output"])
        for split, split_ds in splits.items()
    }
    for domain, splits in william_2.items()
}

                                                                    

# Data Collator

## English

In [31]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for the English runs, bound to the English tokenizer.
data_collator_en = DataCollatorForSeq2Seq(tokenizer=tokenizer_en)

## Indo

In [32]:
# Batch collator for the Indonesian runs, bound to the Indonesian tokenizer.
data_collator_id = DataCollatorForSeq2Seq(tokenizer=tokenizer_id)

# Compute Metrics

In [33]:
from transformers import EvalPrediction
from evaluation import recall, precision, f1_score, summary_score
from typing import List, Dict, Tuple
import numpy as np

def seperate_target_prediction_per_task(predictions:List[List[Dict]],targets:List[List[Dict]],tasks:List) -> Tuple[Dict[str,List],Dict[str,List]]:
    """
    ### DESC
        Group parallel lists of targets and predictions by their task name.
    ### PARAMS
    * predictions: Parsed predictions, one entry per example.
    * targets: Parsed targets, one entry per example.
    * tasks: Task name of each example (same order as the other two lists).
    ### RETURN
    * (per_task_targets, per_task_predictions): two dicts keyed by task name,
      each holding that task's entries in their original order.
    """
    grouped_targets = {}
    grouped_predictions = {}
    for gold, predicted, task_name in zip(targets,predictions,tasks):
        grouped_targets.setdefault(task_name, []).append(gold)
        grouped_predictions.setdefault(task_name, []).append(predicted)
    return grouped_targets, grouped_predictions

def _ids_to_text(ids, decoding_args, tokenizer) -> List[str]:
    """Decode one field of an EvalPrediction (ids or logits) into strings."""
    # The field is absent, e.g. inputs when they were not collected for metrics
    if ids is None:
        return []
    # In case the model returns more than the prediction logits
    if isinstance(ids, tuple):
        ids = ids[0]
    # A 3-D array is treated as logits (not predicted with generate): take the arg-max id
    if len(ids.shape) == 3:
        ids = np.argmax(ids,axis=-1)
    # -100 is a filler value, not a vocabulary id; drop it before decoding
    rows = [[token for token in row if token != -100] for row in ids]
    return tokenizer.batch_decode(rows,**decoding_args)

def preprocess_eval_preds(eval_preds:EvalPrediction,decoding_args:Dict[str,object],tokenizer:AutoTokenizer) -> Tuple[List[str],List[str],List[str]]:
    """
    ### DESC
        Decode the inputs, labels and predictions of an evaluation run to text.
    ### PARAMS
    * eval_preds: Object exposing `inputs`, `label_ids` and `predictions`. Each
      may be an array of ids, an array of logits (3-D) or a tuple whose first
      item is one of those. `inputs` may also be None.
    * decoding_args: Keyword arguments forwarded to `tokenizer.batch_decode`.
    * tokenizer: Tokenizer used for decoding.
    ### RETURN
    * (inputs, targets, predictions): three lists of decoded strings. `inputs`
      is an empty list when `eval_preds.inputs` is None.
    """
    inputs = _ids_to_text(eval_preds.inputs, decoding_args, tokenizer)
    targets = _ids_to_text(eval_preds.label_ids, decoding_args, tokenizer)
    predictions = _ids_to_text(eval_preds.predictions, decoding_args, tokenizer)

    return inputs, targets, predictions

def compute_metrics(eval_preds:EvalPrediction,decoding_args:Dict[str,object],tokenizer:AutoTokenizer,tasks:List) -> Dict[str,float]: # MAY NOT BE SUFFICIENT FOR CAUSAL LM
    """
    ### DESC
        Method to compute the metrics.
    ### PARAMS
    * eval_preds: EvalPrediction instance from training.
    * decoding_args: Decoding arguments.
    * tokenizer: Tokenizer used to decode the ids back to text.
    * tasks: Task name of every evaluated example, in dataset order. Examples
      whose task is "non_absa" are excluded from every metric.
    ### RETURN
    * metrics: Dictionary of metrics: overall recall/precision/F1 plus the same
      three scores per task (keys "<task>_recall", "<task>_precision",
      "<task>_f1_score").
    """
    inputs, targets, predictions = preprocess_eval_preds(eval_preds,decoding_args,tokenizer)

    # Drop "non_absa" examples from targets, predictions AND tasks in one pass.
    # Filtering only the first two would leave `tasks` longer than them, so the
    # per-task grouping below would pair examples with the wrong task.
    kept = [
        (target_text, prediction_text, task)
        for target_text, prediction_text, task in zip(targets,predictions,tasks)
        if task != "non_absa"
    ]
    absa_tasks = [task for _, _, task in kept]

    # NOTE(review): when decoding keeps special tokens, any end-of-sequence or
    # padding text is still present here and can end up inside the last
    # free-text element captured by catch_answer — confirm this is intended.
    targets = [catch_answer(target_text,task) for target_text, _, task in kept]
    predictions = [catch_answer(prediction_text,task) for _, prediction_text, task in kept]

    per_task_targets, per_task_predictions = seperate_target_prediction_per_task(predictions, targets, absa_tasks)

    metrics = {}

    metrics["overall_recall"] = recall(predictions,targets)
    metrics["overall_precision"] = precision(predictions,targets)
    metrics["overall_f1_score"] = f1_score(predictions,targets)

    for task in per_task_targets.keys():
        metrics[f"{task}_recall"] = recall(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_precision"] = precision(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_f1_score"] = f1_score(per_task_predictions[task],per_task_targets[task])

    return metrics

# Train Arguments

In [34]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Checkpoints are saved and evaluation is run once per
# epoch; the best checkpoint is chosen by the "overall_f1_score" metric produced
# by compute_metrics, and model inputs are passed to the metric function.
train_args = Seq2SeqTrainingArguments(
    num_train_epochs=20,
    learning_rate=3e-4,
    save_total_limit=2,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    metric_for_best_model="overall_f1_score",
    load_best_model_at_end=True,
    adam_epsilon=1e-08,
    output_dir="./t5",
    include_inputs_for_metrics=True,
)

# Model

## English

In [35]:
# English seq2seq model; same "t5-base" checkpoint as tokenizer_en.
model_en = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

## Indo

In [36]:
# Indonesian seq2seq model; same checkpoint as tokenizer_id.
model_id = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base")

# Train

In [37]:
import torch

# Use the first visible GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [38]:
# Move both models onto the same device selected above. The second call is the
# cell's last expression, so its return value is what gets displayed.
model_en.to(device)
model_id.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [39]:
from transformers import Seq2SeqTrainer

# Registry of trainers: trainer[dataset_name][domain] -> trainer instance
# (filled by the per-dataset loops below).
trainer = {
    "peng" : {},
    "wan" : {},
    "zhang" : {},
    "william" : {}
}

# Arguments forwarded to tokenizer.batch_decode when computing metrics.
# Special tokens are kept in the decoded text; catch_answer relies on the
# "<extra_id_N>" markers being present in the text it parses.
decoding_args = {
    "skip_special_tokens" : False
}

def preprocess_logits_for_metrics(logits, labels):
    """Collapse raw model outputs to predicted ids before metrics are computed.

    Args:
        logits: Either the logits tensor itself, or a tuple whose first element
            is the logits tensor (any further elements are ignored).
        labels: Reference labels; passed through unchanged.

    Returns:
        A ``(pred_ids, labels)`` tuple, where ``pred_ids`` holds the argmax of
        the logits over their last dimension.
    """
    # Only the first element of a tuple output is used; everything else is dropped.
    scores = logits
    if isinstance(scores, tuple):
        scores = scores[0]

    token_ids = torch.argmax(scores, dim=-1)
    return token_ids, labels

# for domain_peng in peng_2.keys():
#     trainer["peng"][domain_peng] = Seq2SeqTrainer(
#         model = model_en,
#         args = train_args,
#         tokenizer = tokenizer_en,
#         data_collator = data_collator_en,
#         train_dataset = peng_tok[domain_peng]["train"],
#         eval_dataset = peng_tok[domain_peng]["val"],
#         compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2[domain_peng]["val"]["task"]),
#         preprocess_logits_for_metrics = preprocess_logits_for_metrics
#     )

# for domain_wan in wan_2.keys():
#     trainer["wan"][domain_wan] = Seq2SeqTrainer(
#         model = model_en,
#         args = train_args,
#         tokenizer = tokenizer_en,
#         data_collator = data_collator_en,
#         train_dataset = wan_tok[domain_wan]["train"],
#         eval_dataset = wan_tok[domain_wan]["val"],
#         compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,wan_2[domain_wan]["val"]["task"]),
#         preprocess_logits_for_metrics = preprocess_logits_for_metrics
#     )

# for domain_zhang in zhang_2.keys():
#     trainer["zhang"][domain_zhang] = Seq2SeqTrainer(
#         model = model_en,
#         args = train_args,
#         tokenizer = tokenizer_en,
#         data_collator = data_collator_en,
#         train_dataset = zhang_tok[domain_zhang]["train"],
#         eval_dataset = zhang_tok[domain_zhang]["val"],
#         compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,zhang_2[domain_zhang]["val"]["task"]),
#         preprocess_logits_for_metrics = preprocess_logits_for_metrics
#     )

# for domain_william in william_2.keys():
#     trainer["william"][domain_william] = Seq2SeqTrainer(
#         model = model_id,
#         args = train_args,
#         tokenizer = tokenizer_id,
#         data_collator = data_collator_id,
#         train_dataset = william_tok[domain_william]["train"],
#         eval_dataset = william_tok[domain_william]["val"],
#         compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_id,william_2[domain_william]["val"]["task"]),
#         preprocess_logits_for_metrics = preprocess_logits_for_metrics
#     )

In [41]:
# Load "t5-base" into model_en again before building the trainer
# (presumably so this run starts from the pretrained weights — confirm).
model_en = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Trainer for the Peng lap14 domain: trains on the tokenized train split and
# evaluates on the tokenized val split. The metrics callback also receives the
# "task" column of the untokenized lap14 validation data.
lap14_trainer = Seq2SeqTrainer(
    model=model_en,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=peng_tok["lap14"]["train"],
    eval_dataset=peng_tok["lap14"]["val"],
    compute_metrics=lambda eval_preds: compute_metrics(
        eval_preds,
        decoding_args,
        tokenizer_en,
        peng_2["lap14"]["val"]["task"],
    ),
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer["peng"]["lap14"] = lap14_trainer

# Last expression of the cell: the training result is rendered as the cell output.
lap14_trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3624
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2280
  Number of trainable parameters = 222903552


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-114
Configuration saved in ./t5/checkpoint-114/config.json
Configuration saved in ./t5/checkpoint-114/config.json
Model weights saved in ./t5/checkpoint-114/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-114/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-114/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-114/spiece.model
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-228
Configuration saved in ./t5/checkpoint-228/config.json
Configuration saved in ./t5/checkpoint-228/config.json
Model weights saved in ./t5/checkpoint-228/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-228/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-228/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-228/spiece.model


Saving model checkpoint to ./t5/checkpoint-342
Configuration saved in ./t5/checkpoint-342/config.json
Configuration saved in ./t5/checkpoint-342/config.json
Model weights saved in ./t5/checkpoint-342/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-342/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-342/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-342/spiece.model
Deleting older checkpoint [t5/checkpoint-114] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-456
Configuration saved in ./t5/checkpoint-456/config.json
Configuration saved in ./t5/checkpoint-456/config.json
Model weights saved in ./t5/checkpoint-456/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-456/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-456/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-456/spiece.model
Deleting older checkpoint [t5/checkpoint-228] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-570
Configuration saved in ./t5/checkpoint-570/config.json
Configuration saved in ./t5/checkpoint-570/config.json
Model weights saved in ./t5/checkpoint-570/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-570/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-570/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-570/spiece.model
Deleting older checkpoint [t5/checkpoint-342] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-684
Configuration saved in ./t5/checkpoint-684/config.json
Configuration saved in ./t5/checkpoint-684/config.json
Model weights saved in ./t5/checkpoint-684/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-684/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-684/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-684/spiece.model
Deleting older checkpoint [t5/checkpoint-570] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-798
Configuration saved in ./t5/checkpoint-798/config.json
Configuration saved in ./t5/checkpoint-798/config.json
Model weights saved in ./t5/checkpoint-798/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-798/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-798/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-798/spiece.model
Deleting older checkpoint [t5/checkpoint-684] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-912
Configuration saved in ./t5/checkpoint-912/config.json
Configuration saved in ./t5/checkpoint-912/config.json
Model weights saved in ./t5/checkpoint-912/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-912/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-912/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-912/spiece.model
Deleting older checkpoint [t5/checkpoint-798] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1026
Configuration saved in ./t5/checkpoint-1026/config.json
Configuration saved in ./t5/checkpoint-1026/config.json
Model weights saved in ./t5/checkpoint-1026/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1026/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1026/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1026/spiece.model
Deleting older checkpoint [t5/checkpoint-912] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1140
Configuration saved in ./t5/checkpoint-1140/config.json
Configuration saved in ./t5/checkpoint-1140/config.json
Model weights saved in ./t5/checkpoint-1140/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1140/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1140/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1140/spiece.model
Deleting older checkpoint [t5/checkpoint-1026] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1254
Configuration saved in ./t5/checkpoint-1254/config.json
Configuration saved in ./t5/checkpoint-1254/config.json
Model weights saved in ./t5/checkpoint-1254/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1254/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1254/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1254/spiece.model
Deleting older checkpoint [t5/checkpoint-1140] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1368
Configuration saved in ./t5/checkpoint-1368/config.json
Configuration saved in ./t5/checkpoint-1368/config.json
Model weights saved in ./t5/checkpoint-1368/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1368/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1368/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1368/spiece.model
Deleting older checkpoint [t5/checkpoint-1254] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1482
Configuration saved in ./t5/checkpoint-1482/config.json
Configuration saved in ./t5/checkpoint-1482/config.json
Model weights saved in ./t5/checkpoint-1482/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1482/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1482/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1482/spiece.model
Deleting older checkpoint [t5/checkpoint-1368] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1596
Configuration saved in ./t5/checkpoint-1596/config.json
Configuration saved in ./t5/checkpoint-1596/config.json
Model weights saved in ./t5/checkpoint-1596/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1596/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1596/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1596/spiece.model
Deleting older checkpoint [t5/checkpoint-1482] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1710
Configuration saved in ./t5/checkpoint-1710/config.json
Model weights saved in ./t5/checkpoint-1710/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1710/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1710/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1710/spiece.model
Deleting older checkpoint [t5/checkpoint-1596] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1824
Configuration saved in ./t5/checkpoint-1824/config.json
Configuration saved in ./t5/checkpoint-1824/config.json
Model weights saved in ./t5/checkpoint-1824/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1824/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1824/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1824/spiece.model
Deleting older checkpoint [t5/checkpoint-1710] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-1938
Configuration saved in ./t5/checkpoint-1938/config.json
Configuration saved in ./t5/checkpoint-1938/config.json
Model weights saved in ./t5/checkpoint-1938/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-1938/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-1938/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-1938/spiece.model
Deleting older checkpoint [t5/checkpoint-1824] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-2052
Configuration saved in ./t5/checkpoint-2052/config.json
Configuration saved in ./t5/checkpoint-2052/config.json
Model weights saved in ./t5/checkpoint-2052/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-2052/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-2052/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-2052/spiece.model
Deleting older checkpoint [t5/checkpoint-1938] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-2166
Configuration saved in ./t5/checkpoint-2166/config.json
Configuration saved in ./t5/checkpoint-2166/config.json
Model weights saved in ./t5/checkpoint-2166/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-2166/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-2166/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-2166/spiece.model
Deleting older checkpoint [t5/checkpoint-2052] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: task. If task are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32


Saving model checkpoint to ./t5/checkpoint-2280
Configuration saved in ./t5/checkpoint-2280/config.json
Configuration saved in ./t5/checkpoint-2280/config.json
Model weights saved in ./t5/checkpoint-2280/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-2280/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-2280/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-2280/spiece.model
Deleting older checkpoint [t5/checkpoint-2166] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./t5/checkpoint-456 (score: 0.5948493906645206).


TrainOutput(global_step=2280, training_loss=0.008247792086841767, metrics={'train_runtime': 884.8347, 'train_samples_per_second': 81.914, 'train_steps_per_second': 2.577, 'total_flos': 8965382543769600.0, 'train_loss': 0.008247792086841767, 'epoch': 20.0})