In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
import pandas as pd

In [3]:
import sys
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
sys.path.append("../../src/")
import data_utils

  from .autonotebook import tqdm as notebook_tqdm


# Dataset Utilities

In [4]:
peng_dir = dict(
    lap14 = "../../data/absa/en/peng/14lap",
    res14 = "../../data/absa/en/peng/14res",
    res15 = "../../data/absa/en/peng/15res",
    res16 = "../../data/absa/en/peng/16res"
)

wan_dir = dict(
    res15 = "../../data/absa/en/wan/interim/rest15",
    res16 = "../../data/absa/en/wan/interim/rest16"
)
    
zhang_dir = dict(
    res15 = "../../data/absa/en/zhang/interim/interim_2/rest15",
    res16 = "../../data/absa/en/zhang/interim/interim_2/rest16"
)

william_dir = dict(
    hotel = "../../data/absa/id/william"
)

peng = dict(
    lap14 = dict(
        train = data_utils.read_data(path=peng_dir["lap14"] + "/train_triplets.txt",
                                     target_format="aos"),
        val = data_utils.read_data(path=peng_dir["lap14"] + "/dev_triplets.txt",
                                     target_format="aos"),
        test = data_utils.read_data(path=peng_dir["lap14"] + "/test_triplets.txt",
                                     target_format="aos")
    ),
    res14 = dict(
        train = data_utils.read_data(path=peng_dir["res14"] + "/train_triplets.txt",
                                     target_format="aos"),
        val = data_utils.read_data(path=peng_dir["res14"] + "/dev_triplets.txt",
                                     target_format="aos"),
        test = data_utils.read_data(path=peng_dir["res14"] + "/test_triplets.txt",
                                     target_format="aos")
    ),
    res15 = dict(
        train = data_utils.read_data(path=peng_dir["res15"] + "/train_triplets.txt",
                                     target_format="aos"),
        val = data_utils.read_data(path=peng_dir["res15"] + "/dev_triplets.txt",
                                     target_format="aos"),
        test = data_utils.read_data(path=peng_dir["res15"] + "/test_triplets.txt",
                                     target_format="aos")
    ),
    res16 = dict(
        train = data_utils.read_data(path=peng_dir["res16"] + "/train_triplets.txt",
                                     target_format="aos"),
        val = data_utils.read_data(path=peng_dir["res16"] + "/dev_triplets.txt",
                                     target_format="aos"),
        test = data_utils.read_data(path=peng_dir["res16"] + "/test_triplets.txt",
                                     target_format="aos")
    )
)

wan = dict(
    res15 = dict(
        train = data_utils.read_data(path=wan_dir["res15"] + "/train.txt",
                                     target_format="acs"),
        val = data_utils.read_data(path=wan_dir["res15"] + "/dev.txt",
                                     target_format="acs"),
        test = data_utils.read_data(path=wan_dir["res15"] + "/test.txt",
                                     target_format="acs")
    ),
    res16 = dict(
        train = data_utils.read_data(path=wan_dir["res16"] + "/train.txt",
                                     target_format="acs"),
        val = data_utils.read_data(path=wan_dir["res16"] + "/dev.txt",
                                     target_format="acs"),
        test = data_utils.read_data(path=wan_dir["res16"] + "/test.txt",
                                     target_format="acs")
    )
)

zhang = dict(
    res15 = dict(
        train = data_utils.read_data(path=zhang_dir["res15"] + "/train.txt",
                                     target_format="acso"),
        val = data_utils.read_data(path=zhang_dir["res15"] + "/dev.txt",
                                     target_format="acso"),
        test = data_utils.read_data(path=zhang_dir["res15"] + "/test.txt",
                                     target_format="acso")
    ),
    res16 = dict(
        train = data_utils.read_data(path=zhang_dir["res16"] + "/train.txt",
                                     target_format="acso"),
        val = data_utils.read_data(path=zhang_dir["res16"] + "/dev.txt",
                                     target_format="acso"),
        test = data_utils.read_data(path=zhang_dir["res16"] + "/test.txt",
                                     target_format="acso")
    )
)

william = dict(
    hotel = dict(
        train = data_utils.read_data(path=william_dir["hotel"] + "/train.txt",
                                     target_format="aos"),
        val = data_utils.read_data(path=william_dir["hotel"] + "/dev.txt",
                                     target_format="aos"),
        test = data_utils.read_data(path=william_dir["hotel"] + "/test.txt",
                                     target_format="aos")
    )
)

# Data Preprocessing 1

In [5]:
data_utils.SENTIMENT_ELEMENT = {'a' : "aspect", 'o' : "opinion", 's' : "sentiment", 'c' : "category"}

1. AOS (ASTE)
    * AO
    * AS
    * A
    * O

2. ACS (TASD)
    * AS
    * CS
    * A
    * C

3. ACOS
    * AO
    * AS
    * CS
    * A
    * O
    * C

In [6]:
task_tree = {
    "oas" : ["oa","as",'a','o'],
    "asc" : ["as","sc",'a','c'],
    "oasc" : ["oa","as","sc",'a','o','c']
}

all_task = []
for k,v1 in task_tree.items():
    if k not in all_task:
        all_task.append(k)
    for v2 in v1:
        if v2 not in all_task:
            all_task.append(v2)

print(all_task)

['oas', 'oa', 'as', 'a', 'o', 'asc', 'sc', 'c', 'oasc']


In [7]:
data_utils.remove_duplicate_targets(data_utils.reduce_targets([{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"}],"ao"))

[{'aspect': 'battery life', 'opinion': 'good'}]

Handle mix may not be a must, but we'll see it later. Will be problematic if like as (UABSA / E2E ABSA) used for training AOS (ASTE) --> may be for further experiment because we will insert imputing later on

In [8]:
data_utils.handle_mix_sentiment(data_utils.reduce_targets([{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"}],"aos"))

[{'aspect': 'battery life', 'opinion': 'good', 'sentiment': 'mixed'}]

In [9]:
from copy import deepcopy

# Peng (ASTE/AOS)
peng_intermediate = dict()

for domain, v1 in peng.items():
    peng_intermediate[domain] = dict()
    for task in ["oas"] + task_tree["oas"]:
        peng_intermediate[domain][task] = dict()
        for split in v1.keys():
            ds = peng[domain][split]
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce
                ds_copy[i]["target"] = data_utils.reduce_targets(ds_copy[i]["target"],task)
                # Remove Duplicates
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(ds_copy[i]["target"])
            peng_intermediate[domain][task][split] = ds_copy

In [10]:
# Wan (TASD/ACS)
wan_intermediate = dict()

for domain, v1 in wan.items():
    wan_intermediate[domain] = dict()
    for task in ["asc"] + task_tree["asc"]:
        wan_intermediate[domain][task] = dict()
        for split in v1.keys():
            ds = wan[domain][split]
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce
                ds_copy[i]["target"] = data_utils.reduce_targets(ds_copy[i]["target"],task)
                # Remove Duplicates
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(ds_copy[i]["target"])
            wan_intermediate[domain][task][split] = ds_copy

In [11]:
# Zhang (ACOS)
zhang_intermediate = dict()

for domain, v1 in zhang.items():
    zhang_intermediate[domain] = dict()
    for task in ["oasc"] + task_tree["oasc"]:
        zhang_intermediate[domain][task] = dict()
        for split in v1.keys():
            ds = zhang[domain][split]
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce
                ds_copy[i]["target"] = data_utils.reduce_targets(ds_copy[i]["target"],task)
                # Remove Duplicates
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(ds_copy[i]["target"])
            zhang_intermediate[domain][task][split] = ds_copy

In [12]:
# William (AOS ID)
william_intermediate = dict()

for domain, v1 in william.items():
    william_intermediate[domain] = dict()
    for task in ["oas"] + task_tree["oas"]:
        william_intermediate[domain][task] = dict()
        for split in v1.keys():
            ds = william[domain][split]
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce
                ds_copy[i]["target"] = data_utils.reduce_targets(ds_copy[i]["target"],task)
                # Remove Duplicates
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(ds_copy[i]["target"])
            william_intermediate[domain][task][split] = ds_copy

# Answer Engineering

In [13]:
mask = "<extra_id_X>"

In [14]:
def construct_answer(targets,se_order):
    result = []
    counter = 0
    for t in targets:
        constructed_t = ""
        for se in se_order:
            if counter > 99:
                raise Exception("Extra id more than 99!")
            constructed_t += ' ' + mask.replace('X',str(counter)) + ' ' + t[data_utils.SENTIMENT_ELEMENT[se]]
            counter += 1
        constructed_t = constructed_t.strip()
        result.append(constructed_t)
    result = " ; ".join(result)
    return result

In [15]:
construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")

'<extra_id_0> no <extra_id_1> GUI <extra_id_2> negative ; <extra_id_3> dark <extra_id_4> screen <extra_id_5> negative ; <extra_id_6> steady <extra_id_7> power light <extra_id_8> neutral ; <extra_id_9> steady <extra_id_10> hard drive light <extra_id_11> negative'

# Prompt Engineering

In [16]:
def construct_prompt(text,se_order):
    prompt = []
    for counter, se in enumerate(se_order):
        prompt.append(data_utils.SENTIMENT_ELEMENT[se] + " : " + mask.replace('X',str(counter)))
    prompt = " ,".join(prompt)
    result = text + "| " + prompt
    return result

In [17]:
construct_prompt(peng_intermediate["lap14"]["oas"]["train"][4]["text"],"oas")

'One night I turned the freaking thing off after using it , the next day I turn it on , no GUI , screen all dark , power light steady , hard drive light steady and not flashing as it usually does .| opinion : <extra_id_0> ,aspect : <extra_id_1> ,sentiment : <extra_id_2>'

# Answer Catch

In [18]:
import re

def catch_answer(output,se_order):
    output = output.replace("<pad>",'')
    output = output.replace("</s>",'')
    pattern = r""
    for se in se_order:
        if se != 's':
            pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT[se]}>[^;]+)\s*"
        else:
            pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT['s']}>positive|negative|neutral)\s*"
    found = [found_iter.groupdict() for found_iter in re.finditer(pattern,output)]
    for i in range(len(found)):
        for k, v in found[i].items():
            found[i][k] = found[i][k].strip()
    return found

In [19]:
output = construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")
se_order = "oas"
catch_answer(output,se_order)

[{'opinion': 'no', 'aspect': 'GUI', 'sentiment': 'negative'},
 {'opinion': 'dark', 'aspect': 'screen', 'sentiment': 'negative'},
 {'opinion': 'steady', 'aspect': 'power light', 'sentiment': 'neutral'},
 {'opinion': 'steady', 'aspect': 'hard drive light', 'sentiment': 'negative'}]

In [20]:
output

'<extra_id_0> no <extra_id_1> GUI <extra_id_2> negative ; <extra_id_3> dark <extra_id_4> screen <extra_id_5> negative ; <extra_id_6> steady <extra_id_7> power light <extra_id_8> neutral ; <extra_id_9> steady <extra_id_10> hard drive light <extra_id_11> negative'

# Data Preprocessing 2

In [21]:
from datasets import Dataset

peng_2 = dict()
for domain, v1 in peng_intermediate.items():
    peng_2[domain] = {
        "train" : [], # basic task
        "val" : [], # complex task
        "test" : [] # complex task
    }
    # TRAIN
    for basic_task in task_tree["oas"]:
        for el in peng_intermediate[domain][basic_task]["train"]:
            peng_2[domain]["train"].append({
                    "input" : construct_prompt(el["text"],basic_task),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    # VAL
    for el in peng_intermediate[domain]["oas"]["val"]:
        peng_2[domain]["val"].append({
                "input" : construct_prompt(el["text"],"oas"),
                "output" : construct_answer(el["target"],"oas"),
                "task" : "oas"
            })
    # TEST
    for el in peng_intermediate[domain]["oas"]["test"]:
        peng_2[domain]["test"].append({
                "input" : construct_prompt(el["text"],"oas"),
                "output" : construct_answer(el["target"],"oas"),
                "task" : "oas"
            })
    peng_2[domain]["train"] = Dataset.from_list(peng_2[domain]["train"])
    peng_2[domain]["val"] = Dataset.from_list(peng_2[domain]["val"])
    peng_2[domain]["test"] = Dataset.from_list(peng_2[domain]["test"])

wan_2 = dict()
for domain, v1 in wan_intermediate.items():
    wan_2[domain] = {
        "train" : [], # basic task
        "val" : [], # complex task
        "test" : [] # complex task
    }
    # TRAIN
    for basic_task in task_tree["asc"]:
        for el in wan_intermediate[domain][basic_task]["train"]:
            wan_2[domain]["train"].append({
                    "input" : construct_prompt(el["text"],basic_task),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    # VAL
    for el in wan_intermediate[domain]["asc"]["val"]:
        wan_2[domain]["val"].append({
                "input" : construct_prompt(el["text"],"asc"),
                "output" : construct_answer(el["target"],"asc"),
                "task" : "asc"
            })
    # TEST
    for el in wan_intermediate[domain]["asc"]["test"]:
        wan_2[domain]["test"].append({
                "input" : construct_prompt(el["text"],"asc"),
                "output" : construct_answer(el["target"],"asc"),
                "task" : "asc"
            })
    wan_2[domain]["train"] = Dataset.from_list(wan_2[domain]["train"])
    wan_2[domain]["val"] = Dataset.from_list(wan_2[domain]["val"])
    wan_2[domain]["test"] = Dataset.from_list(wan_2[domain]["test"])

zhang_2 = dict()
for domain, v1 in zhang_intermediate.items():
    zhang_2[domain] = {
        "train" : [], # basic task
        "val" : [], # complex task
        "test" : [] # complex task
    }
    # TRAIN
    for basic_task in task_tree["oasc"]:
        for el in zhang_intermediate[domain][basic_task]["train"]:
            zhang_2[domain]["train"].append({
                    "input" : construct_prompt(el["text"],basic_task),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    # VAL
    for el in zhang_intermediate[domain]["oasc"]["val"]:
        zhang_2[domain]["val"].append({
                "input" : construct_prompt(el["text"],"oasc"),
                "output" : construct_answer(el["target"],"oasc"),
                "task" : "oasc"
            })
    # TEST
    for el in zhang_intermediate[domain]["oasc"]["test"]:
        zhang_2[domain]["test"].append({
                "input" : construct_prompt(el["text"],"oasc"),
                "output" : construct_answer(el["target"],"oasc"),
                "task" : "oasc"
            })
    zhang_2[domain]["train"] = Dataset.from_list(zhang_2[domain]["train"])
    zhang_2[domain]["val"] = Dataset.from_list(zhang_2[domain]["val"])
    zhang_2[domain]["test"] = Dataset.from_list(zhang_2[domain]["test"])

william_2 = dict()
for domain, v1 in william_intermediate.items():
    william_2[domain] = {
        "train" : [], # basic task
        "val" : [], # complex task
        "test" : [] # complex task
    }
    # TRAIN
    for basic_task in task_tree["oas"]:
        for el in william_intermediate[domain][basic_task]["train"]:
            william_2[domain]["train"].append({
                    "input" : construct_prompt(el["text"],basic_task),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    # VAL
    for el in william_intermediate[domain]["oas"]["val"]:
        william_2[domain]["val"].append({
                "input" : construct_prompt(el["text"],"oas"),
                "output" : construct_answer(el["target"],"oas"),
                "task" : "oas"
            })
    # TEST
    for el in william_intermediate[domain]["oas"]["test"]:
        william_2[domain]["test"].append({
                "input" : construct_prompt(el["text"],"oas"),
                "output" : construct_answer(el["target"],"oas"),
                "task" : "oas"
            })
    william_2[domain]["train"] = Dataset.from_list(william_2[domain]["train"])
    william_2[domain]["val"] = Dataset.from_list(william_2[domain]["val"])
    william_2[domain]["test"] = Dataset.from_list(william_2[domain]["test"])

In [22]:
william_2["hotel"]["train"][69]

{'input': 'tempat yag bagus dan nyaman untuk istirahat tetapi tolong tvnya perlu di perbaiki channelnya karena banyak semutnya digambar dan water heaternya tidak bisa jadi mandi air dingin terus .| opinion : <extra_id_0> ,aspect : <extra_id_1>',
 'output': '<extra_id_0> bagus <extra_id_1> tempat ; <extra_id_2> nyaman <extra_id_3> tempat ; <extra_id_4> perlu di perbaiki <extra_id_5> tvnya ; <extra_id_6> tidak bisa <extra_id_7> water heaternya',
 'task': 'oa'}

# Prepare Tokenized Dataset

## English

In [23]:
tokenizer_en = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [24]:
encoding_args = {
    "max_length" : 512,
    "padding" : True,
    "truncation" : True,
    "return_tensors" : "pt"
}

In [25]:
def encode_en(dataset):
    result = tokenizer_en(dataset["input"], text_target=dataset["output"], **encoding_args)
    return result

In [26]:
peng_tok = dict()
for domain, v1 in peng_2.items():
    peng_tok[domain] = dict()
    for split, v2 in v1.items():
        if split != "test":
            peng_tok[domain][split] = peng_2[domain][split].map(encode_en,batched=True,remove_columns=["input","output","task"])
        else:
            peng_tok[domain][split] = encode_en(peng_2[domain][split])

                                                                 

In [27]:
wan_tok = dict()
for domain, v1 in wan_2.items():
    wan_tok[domain] = dict()
    for split, v2 in v1.items():
        if split != "test":
            wan_tok[domain][split] = wan_2[domain][split].map(encode_en,batched=True,remove_columns=["input","output","task"])
        else:
            wan_tok[domain][split] = encode_en(wan_2[domain][split])

                                                                 

In [28]:
zhang_tok = dict()
for domain, v1 in zhang_2.items():
    zhang_tok[domain] = dict()
    for split, v2 in v1.items():
        if split != "test":
            zhang_tok[domain][split] = zhang_2[domain][split].map(encode_en,batched=True,remove_columns=["input","output","task"])
        else:
            zhang_tok[domain][split] = encode_en(zhang_2[domain][split])

                                                                 

## Indo

In [29]:
tokenizer_id = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base")

In [30]:
def encode_id(dataset):
    result = tokenizer_id(dataset["input"], text_target=dataset["output"], **encoding_args)
    return result

In [31]:
william_tok = dict()
for domain, v1 in william_2.items():
    william_tok[domain] = dict()
    for split, v2 in v1.items():
        if split != "test":
            william_tok[domain][split] = william_2[domain][split].map(encode_id,batched=True,remove_columns=["input","output","task"])
        else:
            william_tok[domain][split] = encode_id(william_2[domain][split])

                                                                   

# Data Collator

## English

In [32]:
from transformers import DataCollatorForSeq2Seq

data_collator_en = DataCollatorForSeq2Seq(tokenizer=tokenizer_en)

## Indo

In [33]:
data_collator_id = DataCollatorForSeq2Seq(tokenizer=tokenizer_id)

# Compute Metrics

In [34]:
from transformers import EvalPrediction
from evaluation import recall, precision, f1_score, summary_score
from typing import List, Dict, Tuple
import numpy as np

def seperate_target_prediction_per_task(predictions:List[List[Dict]],targets:List[List[Dict]],tasks:List) -> Tuple[Dict[str,List],Dict[str,List]]:
    per_task_targets = {}
    per_task_predictions = {}
    for target, prediction, task in zip(targets,predictions,tasks):
        if task not in per_task_targets.keys():
            per_task_targets[task] = []
        if task not in per_task_predictions.keys():
            per_task_predictions[task] = []
        per_task_targets[task].append(target)
        per_task_predictions[task].append(prediction)
    return per_task_targets, per_task_predictions

def preprocess_eval_preds(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer):
    input_ids = eval_preds.inputs
    target_ids = eval_preds.label_ids
    pred_ids = eval_preds.predictions

    # In case the model returns more than the prediction logits
    if isinstance(input_ids, tuple):
        input_ids = input_ids[0]
    if isinstance(target_ids, tuple):
        target_ids = target_ids[0]
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    
    input_ids = np.argmax(input_ids,axis=-1) if len(input_ids.shape) == 3 else input_ids # in case not predict with generate
    target_ids = np.argmax(target_ids,axis=-1) if len(target_ids.shape) == 3 else target_ids # in case not predict with generate
    prediction_ids = np.argmax(pred_ids,axis=-1) if len(pred_ids.shape) == 3 else pred_ids # in case not predict with generate

    input_ids = [[token for token in row if token != -100] for row in input_ids]
    target_ids = [[token for token in row if token != -100] for row in target_ids]
    prediction_ids = [[token for token in row if token != -100] for row in prediction_ids]

    inputs = tokenizer.batch_decode(input_ids,**decoding_args)
    targets = tokenizer.batch_decode(target_ids,**decoding_args)
    predictions = tokenizer.batch_decode(prediction_ids,**decoding_args)

    return inputs, targets, predictions

def compute_metrics(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer,tasks:List) -> Dict[str,float]: # MAY NOT BE SUFFICIATE FOR CAUSAL LM
        """
        ### DESC
            Method to compute the metrics.
        ### PARAMS
        * eval_preds: EvalPrediction instance from training.
        * decoding_args: Decoding arguments.
        ### RETURN
        * metrics: Dictionary of metrics.
        """
        inputs, targets, predictions = preprocess_eval_preds(eval_preds,decoding_args,tokenizer)

        targets = [catch_answer(text,task) for text,task in zip(targets,tasks) if task != "non_absa"]
        predictions = [catch_answer(text,task) for text,task in zip(predictions,tasks) if task != "non_absa"]


        per_task_targets, per_task_predictions = seperate_target_prediction_per_task(predictions, targets, tasks)
        
        metrics = {}

        metrics["overall_recall"] = recall(predictions,targets)
        metrics["overall_precision"] = precision(predictions,targets)
        metrics["overall_f1_score"] = f1_score(predictions,targets)

        for task in per_task_targets.keys():
            if task == "non_absa":
                continue
            metrics[f"{task}_recall"] = recall(per_task_predictions[task],per_task_targets[task])
            metrics[f"{task}_precision"] = precision(per_task_predictions[task],per_task_targets[task])
            metrics[f"{task}_f1_score"] = f1_score(per_task_predictions[task],per_task_targets[task])
        
        return metrics

# Train Arguments

In [35]:
from transformers import Seq2SeqTrainingArguments

train_args = {
    "num_train_epochs": 20,
    "learning_rate": 3e-4,
    "save_total_limit": 2,
    "gradient_accumulation_steps": 1,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "logging_strategy" : "epoch",
    "metric_for_best_model": "overall_f1_score",
    "load_best_model_at_end": True,
    "adam_epsilon": 1e-08,
    "output_dir": "./t5",
    "logging_dir" : "./t5/log",
    "include_inputs_for_metrics" : True
}

train_args = Seq2SeqTrainingArguments(**train_args)

# Model

## English

In [36]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

## Indo

In [37]:
model = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base")

# Train

In [38]:
import torch
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [39]:
from transformers import Seq2SeqTrainer

# trainer = {
#     "peng" : {},
#     "wan" : {},
#     "zhang" : {},
#     "william" : {}
# }

decoding_args = {
    "skip_special_tokens" : False
}

def preprocess_logits_for_metrics(logits, targets):
    pred_logits = logits[0] if isinstance(logits,tuple) else logits
    pred_ids = torch.argmax(pred_logits, dim=-1)
    return pred_ids, targets

In [40]:
from tqdm import tqdm

def generate_predictions(model,tokenizer,tokenized:torch.Tensor,device:torch.device=torch.device("cpu"),batch_size:int=16,max_len:int=512,decoding_args:Dict={}) -> List[str]:
    # Data loader
    input_ids_data_loader = torch.utils.data.DataLoader(tokenized["input_ids"],
                        batch_size=batch_size,shuffle=False)
    attention_mask_data_loader = torch.utils.data.DataLoader(tokenized["attention_mask"],
                        batch_size=batch_size,shuffle=False)
    # Predict
    model = model
    tokenizer = tokenizer
    tensor_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(zip(input_ids_data_loader,attention_mask_data_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            tensor_predictions.extend(model.generate(input_ids=input_ids,attention_mask=attention_mask,max_length=max_len,pad_token_id=tokenizer.pad_token_id,eos_token_id=tokenizer.eos_token_id).cpu())
            input_ids = input_ids.cpu()
            attention_mask = attention_mask.cpu()
    tensor_predictions = [[token for token in row if token != -100] for row in tensor_predictions]
    predictions = tokenizer.batch_decode(tensor_predictions,**decoding_args)
    return predictions

In [51]:
import json

def save_result(str_preds_,preds,targets,filename):
    result = []
    str_preds = [el.replace("<pad>",'').replace("</s>",'') for el in str_preds_]
    assert len(str_preds) == len(preds) == len(targets)
    for i in range(len(str_preds)):
        result.append({
            "str_pred" : str_preds[i],
            "pred" : preds[i],
            "target" : targets[i]
        })
    
    with open(filename,'w') as fp:
        json.dump(result,fp)
    return result

# Peng Laptop 2014

In [41]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["lap14"]["train"],
        eval_dataset = peng_tok["lap14"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["lap14"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

***** Running training *****
  Num examples = 3624
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2280
  Number of trainable parameters = 222903552
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
  Num examples = 219
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-114
Configuration saved in ./t5/checkpoint-114/config.json
Model weights saved in ./t5/checkpoint-114/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-114/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-114/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-114/spiece.model
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-228
Configuration saved in ./t5/checkpoint-228/config.json
Model weights saved in ./t5/checkpoint-228/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-228/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-228/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-228/spiece.model
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
Saving model checkpoint to ./

TrainOutput(global_step=2280, training_loss=0.03503607097817095, metrics={'train_runtime': 982.6653, 'train_samples_per_second': 73.759, 'train_steps_per_second': 2.32, 'total_flos': 8965382543769600.0, 'train_loss': 0.03503607097817095, 'epoch': 20.0})

In [42]:
str_preds = generate_predictions(model, tokenizer_en, peng_tok["lap14"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

11it [00:08,  1.29it/s]


In [43]:
targets = [catch_answer(el,"oas") for el in peng_2["lap14"]["test"]["output"]]

In [44]:
summary_score(preds,targets)

{'recall': 0.5027726432532348,
 'precision': 0.6570048309178744,
 'f1_score': 0.5696335078534032}

In [45]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [46]:
del model
torch.cuda.empty_cache()

In [50]:
targets

[[{'opinion': 'fast', 'aspect': 'Boot time', 'sentiment': 'positive'}],
 [{'opinion': 'not fix', 'aspect': 'tech support', 'sentiment': 'negative'}],
 [{'opinion': 'easy', 'aspect': 'Set up', 'sentiment': 'positive'}],
 [{'opinion': 'not enjoy', 'aspect': 'Windows 8', 'sentiment': 'negative'},
  {'opinion': 'not enjoy',
   'aspect': 'touchscreen functions',
   'sentiment': 'negative'}],
 [{'opinion': 'lousy',
   'aspect': 'internal speakers',
   'sentiment': 'negative'}],
 [{'opinion': 'fast', 'aspect': 'use', 'sentiment': 'positive'},
  {'opinion': 'light', 'aspect': 'use', 'sentiment': 'positive'},
  {'opinion': 'simple', 'aspect': 'use', 'sentiment': 'positive'}],
 [{'opinion': 'well', 'aspect': 'Works', 'sentiment': 'positive'},
  {'opinion': 'happy', 'aspect': 'apple OS', 'sentiment': 'positive'}],
 [{'opinion': 'not light and slim',
   'aspect': 'features',
   'sentiment': 'positive'}],
 [{'opinion': 'pleased', 'aspect': 'log on', 'sentiment': 'positive'},
  {'opinion': 'fast', '

In [None]:
result = save_result(str_preds, preds, targets, "peng_lap14.json")

# Peng Restaurant 2014

In [48]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["res14"]["train"],
        eval_dataset = peng_tok["res14"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["res14"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 310
  Batch size = 32
  Num examples = 310
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-159
Configuration saved in ./t5/checkpoint-159/config.json
Model weights saved in ./t5/checkpoint-159/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-159/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-159/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-159/spiece.model
***** Running Evaluation *****
  Num examples = 310
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-318
Configuration saved in ./t5/checkpoint-318/config.json
Model weights saved in ./t5/checkpoint-318/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-318/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-318/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-318/spiece.model
***** Running Evaluation *****
  Num examples = 310
  Batch size = 32
Saving model checkpoint to ./

TrainOutput(global_step=3180, training_loss=0.02443723297156628, metrics={'train_runtime': 1663.3771, 'train_samples_per_second': 60.888, 'train_steps_per_second': 1.912, 'total_flos': 1.397309066698752e+16, 'train_loss': 0.02443723297156628, 'epoch': 20.0})

In [49]:
str_preds = generate_predictions(model, tokenizer_en, peng_tok["res14"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

16it [00:17,  1.10s/it]


In [50]:
targets = [catch_answer(el,"oas") for el in peng_2["res14"]["test"]["output"]]

In [51]:
summary_score(preds,targets)

{'recall': 0.4305835010060362,
 'precision': 0.7210084033613445,
 'f1_score': 0.5391742012021976}

In [52]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "peng_res14.json")

# Peng Restaurant 2015

In [53]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["res15"]["train"],
        eval_dataset = peng_tok["res15"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["res15"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 148
  Batch size = 32
  Num examples = 148
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-76
Configuration saved in ./t5/checkpoint-76/config.json
Model weights saved in ./t5/checkpoint-76/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-76/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-76/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-76/spiece.model
***** Running Evaluation *****
  Num examples = 148
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-152
Configuration saved in ./t5/checkpoint-152/config.json
Model weights saved in ./t5/checkpoint-152/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-152/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-152/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-152/spiece.model
***** Running Evaluation *****
  Num examples = 148
  Batch size = 32
Saving model checkpoint to ./t5/che

TrainOutput(global_step=1520, training_loss=0.03920878225104197, metrics={'train_runtime': 766.7602, 'train_samples_per_second': 63.123, 'train_steps_per_second': 1.982, 'total_flos': 7138128273408000.0, 'train_loss': 0.03920878225104197, 'epoch': 20.0})

In [54]:
str_preds = generate_predictions(model, tokenizer_en, peng_tok["res15"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

11it [00:09,  1.17it/s]


In [55]:
targets = [catch_answer(el,"oas") for el in peng_2["res15"]["test"]["output"]]

In [56]:
summary_score(preds,targets)

{'recall': 0.5298969072164949,
 'precision': 0.6277372262773723,
 'f1_score': 0.5746824581702831}

In [57]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "peng_res15.json")

# Peng Restaurant 2016

In [58]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["res16"]["train"],
        eval_dataset = peng_tok["res16"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["res16"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 210
  Batch size = 32
  Num examples = 210
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-108
Configuration saved in ./t5/checkpoint-108/config.json
Model weights saved in ./t5/checkpoint-108/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-108/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-108/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-108/spiece.model
***** Running Evaluation *****
  Num examples = 210
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-216
Configuration saved in ./t5/checkpoint-216/config.json
Model weights saved in ./t5/checkpoint-216/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-216/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-216/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-216/spiece.model
***** Running Evaluation *****
  Num examples = 210
  Batch size = 32
Saving model checkpoint to ./

TrainOutput(global_step=2160, training_loss=0.030442173609992972, metrics={'train_runtime': 1011.8291, 'train_samples_per_second': 67.758, 'train_steps_per_second': 2.135, 'total_flos': 1.011127478188032e+16, 'train_loss': 0.030442173609992972, 'epoch': 20.0})

In [59]:
str_preds = generate_predictions(model, tokenizer_en, peng_tok["res16"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

11it [00:12,  1.12s/it]


In [60]:
targets = [catch_answer(el,"oas") for el in peng_2["res16"]["test"]["output"]]

In [61]:
summary_score(preds,targets)

{'recall': 0.603112840466926,
 'precision': 0.6813186813186813,
 'f1_score': 0.6398348813209495}

In [62]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "peng_res16.json")

# Wan Restaurant 2015

In [63]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = wan_tok["res15"]["train"],
        eval_dataset = wan_tok["res15"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,wan_2["res15"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
  Num examples = 10
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-140
Configuration saved in ./t5/checkpoint-140/config.json
Model weights saved in ./t5/checkpoint-140/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-140/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-140/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-140/spiece.model
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-280
Configuration saved in ./t5/checkpoint-280/config.json
Model weights saved in ./t5/checkpoint-280/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-280/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-280/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-280/spiece.model
***** Running Evaluation *****
  Num examples = 10
  Batch size = 32
Saving model checkpoint to ./t5/c

TrainOutput(global_step=2800, training_loss=0.02643859689789159, metrics={'train_runtime': 1378.2213, 'train_samples_per_second': 65.011, 'train_steps_per_second': 2.032, 'total_flos': 1.438663016448e+16, 'train_loss': 0.02643859689789159, 'epoch': 20.0})

In [64]:
str_preds = generate_predictions(model, tokenizer_en, wan_tok["res15"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"asc") for el in str_preds]

19it [00:08,  2.12it/s]


In [65]:
targets = [catch_answer(el,"asc") for el in wan_2["res15"]["test"]["output"]]

In [66]:
summary_score(preds,targets)

{'recall': 0.2875739644970414,
 'precision': 0.5560640732265446,
 'f1_score': 0.3790951638065523}

In [67]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "wan_res15.json")

# Wan Restaurant 2016

In [68]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = wan_tok["res16"]["train"],
        eval_dataset = wan_tok["res16"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,wan_2["res16"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 29
  Batch size = 32
  Num examples = 29
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-214
Configuration saved in ./t5/checkpoint-214/config.json
Saving model checkpoint to ./t5/checkpoint-214
Configuration saved in ./t5/checkpoint-214/config.json
Model weights saved in ./t5/checkpoint-214/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-214/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-214/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-214/spiece.model
***** Running Evaluation *****
  Num examples = 29
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-428
Configuration saved in ./t5/checkpoint-428/config.json
Model weights saved in ./t5/checkpoint-428/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-428/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-428/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-428/spiece.model


TrainOutput(global_step=4280, training_loss=0.019733498087065798, metrics={'train_runtime': 2028.5889, 'train_samples_per_second': 67.357, 'train_steps_per_second': 2.11, 'total_flos': 2.1939611000832e+16, 'train_loss': 0.019733498087065798, 'epoch': 20.0})

In [69]:
str_preds = generate_predictions(model, tokenizer_en, wan_tok["res16"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"asc") for el in str_preds]

19it [00:11,  1.70it/s]


In [70]:
targets = [catch_answer(el,"asc") for el in wan_2["res16"]["test"]["output"]]

In [71]:
summary_score(preds,targets)

{'recall': 0.38766006984866125,
 'precision': 0.6121323529411765,
 'f1_score': 0.4746970776906629}

In [72]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "wan_res16.json")

# Zhang Restaurant 2015

In [73]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = zhang_tok["res15"]["train"],
        eval_dataset = zhang_tok["res15"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,zhang_2["res15"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 209
  Batch size = 32
  Num examples = 209
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-157
Configuration saved in ./t5/checkpoint-157/config.json
Model weights saved in ./t5/checkpoint-157/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-157/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-157/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-157/spiece.model
***** Running Evaluation *****
  Num examples = 209
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-314
Configuration saved in ./t5/checkpoint-314/config.json
Model weights saved in ./t5/checkpoint-314/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-314/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-314/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-314/spiece.model
***** Running Evaluation *****
  Num examples = 209
  Batch size = 32
Saving model checkpoint to ./

TrainOutput(global_step=3140, training_loss=0.02554370107212264, metrics={'train_runtime': 1304.0701, 'train_samples_per_second': 76.744, 'train_steps_per_second': 2.408, 'total_flos': 1.08319336280064e+16, 'train_loss': 0.02554370107212264, 'epoch': 20.0})

In [74]:
str_preds = generate_predictions(model, tokenizer_en, zhang_tok["res15"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oasc") for el in str_preds]

17it [00:13,  1.26it/s]


In [75]:
targets = [catch_answer(el,"oasc") for el in zhang_2["res15"]["test"]["output"]]

In [76]:
summary_score(preds,targets)

{'recall': 0.11949685534591195,
 'precision': 0.37109375,
 'f1_score': 0.1807802093244529}

In [77]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "zhang_res15.json")

# Zhang Restaurant 2016

In [78]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = zhang_tok["res16"]["train"],
        eval_dataset = zhang_tok["res16"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,zhang_2["res16"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 316
  Batch size = 32
  Num examples = 316
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-237
Configuration saved in ./t5/checkpoint-237/config.json
Model weights saved in ./t5/checkpoint-237/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-237/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-237/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-237/spiece.model
***** Running Evaluation *****
  Num examples = 316
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-474
Configuration saved in ./t5/checkpoint-474/config.json
Model weights saved in ./t5/checkpoint-474/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-474/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-474/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-474/spiece.model
***** Running Evaluation *****
  Num examples = 316
  Batch size = 32
Saving model checkpoint to ./

TrainOutput(global_step=4740, training_loss=0.01968533838568609, metrics={'train_runtime': 2346.1284, 'train_samples_per_second': 64.651, 'train_steps_per_second': 2.02, 'total_flos': 2.4354509635584e+16, 'train_loss': 0.01968533838568609, 'epoch': 20.0})

In [79]:
str_preds = generate_predictions(model, tokenizer_en, zhang_tok["res16"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oasc") for el in str_preds]

17it [00:11,  1.54it/s]


In [80]:
targets = [catch_answer(el,"oasc") for el in zhang_2["res16"]["test"]["output"]]

In [81]:
summary_score(preds,targets)

{'recall': 0.2640801001251564,
 'precision': 0.4678492239467849,
 'f1_score': 0.3376}

In [82]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "zhang_res16.json")

# William Hotel

In [83]:
model = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_id,
        data_collator = data_collator_en,
        train_dataset = william_tok["hotel"]["train"],
        eval_dataset = william_tok["hotel"]["val"],
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,william_2["hotel"]["val"]["task"]),
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--Wikidepia--IndoT5-base/snapshots/da8e5576aff97b6e6e08ffa669e34bbf87ca637c/config.json
Model config T5Config {
  "_name_or_path": "Wikidepia/IndoT5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.24.0",
  "use_cache": true,
  "vocab_size": 32128
}

loading weights fil

Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oas Recall,Oas Precision,Oas F1 Score
1,3.2002,0.094703,0,0,0,0,0,0


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-375
Configuration saved in ./t5/checkpoint-375/config.json
Saving model checkpoint to ./t5/checkpoint-375
Configuration saved in ./t5/checkpoint-375/config.json
Model weights saved in ./t5/checkpoint-375/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-375/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-375/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-375/spiece.model
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-750
Configuration saved in ./t5/checkpoint-750/config.json
Model weights saved in ./t5/checkpoint-750/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-750/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-750/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-750/spiece.

TrainOutput(global_step=7500, training_loss=0.164492866435647, metrics={'train_runtime': 5288.9667, 'train_samples_per_second': 45.377, 'train_steps_per_second': 1.418, 'total_flos': 4.778348078451917e+16, 'train_loss': 0.164492866435647, 'epoch': 20.0})

In [84]:
str_preds = generate_predictions(model, tokenizer_id, william_tok["hotel"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

32it [01:29,  2.79s/it]


In [85]:
targets = [catch_answer(el,"oas") for el in william_2["hotel"]["test"]["output"]]

In [86]:
summary_score(preds,targets)

{'recall': 0.3274270948689553,
 'precision': 0.7598290598290598,
 'f1_score': 0.4576449083904052}

In [87]:
!rm -rf ./t5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "william_hotel.json")