In [1]:
import os
import torch

# Restrict this process to physical GPUs 2 and 6; set before the first CUDA call so
# that torch only enumerates these two devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,6"
# Number of GPUs torch can see; used further down to derive the per-device batch size.
n_gpu = torch.cuda.device_count()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
import sys
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
sys.path.append("../../../src/")
import data_utils

# Dataset Utilities

In [4]:
# Root directory of every corpus, keyed by domain.
peng_dir = dict(
    lap14 = "../../../data/absa/en/peng/14lap",
    res14 = "../../../data/absa/en/peng/14res",
    res15 = "../../../data/absa/en/peng/15res",
    res16 = "../../../data/absa/en/peng/16res"
)

wan_dir = dict(
    res15 = "../../../data/absa/en/wan/interim/rest15",
    res16 = "../../../data/absa/en/wan/interim/rest16"
)

zhang_dir = dict(
    res15 = "../../../data/absa/en/zhang/interim/interim_2/rest15",
    res16 = "../../../data/absa/en/zhang/interim/interim_2/rest16"
)

william_dir = dict(
    hotel = "../../../data/absa/id/william"
)

# Split name -> file name appended to the domain directory.
# The Peng corpus uses "*_triplets.txt"; every other corpus uses plain "*.txt".
_TRIPLET_SPLIT_FILES = dict(
    train = "/train_triplets.txt",
    val = "/dev_triplets.txt",
    test = "/test_triplets.txt"
)
_PLAIN_SPLIT_FILES = dict(
    train = "/train.txt",
    val = "/dev.txt",
    test = "/test.txt"
)


def _load_corpus(domain_dirs, split_files, target_format):
    """
    Read every split of every domain of one corpus.

    * domain_dirs: mapping domain name -> directory path.
    * split_files: mapping split name -> file name (with leading slash).
    * target_format: value forwarded to ``data_utils.read_data``.

    Returns ``{domain: {split: data_utils.read_data(...)}}``; domains and splits
    keep the iteration order of the two input mappings.
    """
    return {
        domain: {
            split: data_utils.read_data(path=directory + file_name,
                                        target_format=target_format)
            for split, file_name in split_files.items()
        }
        for domain, directory in domain_dirs.items()
    }


peng = _load_corpus(peng_dir, _TRIPLET_SPLIT_FILES, "aos")
wan = _load_corpus(wan_dir, _PLAIN_SPLIT_FILES, "acs")
zhang = _load_corpus(zhang_dir, _PLAIN_SPLIT_FILES, "acso")
william = _load_corpus(william_dir, _PLAIN_SPLIT_FILES, "aos")

# Data Preprocessing 1

In [5]:
# Set (or override) the module-level mapping from the one-letter codes used in task
# names ("oas", "asc", ...) to the dictionary keys of a target.
data_utils.SENTIMENT_ELEMENT = {'a' : "aspect", 'o' : "opinion", 's' : "sentiment", 'c' : "category"}

1. AOS (ASTE)
    * AO
    * AS
    * A
    * O

2. ACS (TASD)
    * AS
    * CS
    * A
    * C

3. ACOS
    * AO
    * AS
    * CS
    * A
    * O
    * C

In [6]:
# Complex task -> tasks derived from it (the complex task itself comes first).
task_tree = {
    "oas" : ["oas","oa","as",'a','o'],
    "asc" : ["asc","as","sc",'a','c'],
    "oasc" : ["oasc","oa","as","sc",'a','o','c']
}

# Every distinct task name, in first-seen order (dict keys preserve insertion order).
all_task = list(dict.fromkeys(
    task
    for root_task, derived_tasks in task_tree.items()
    for task in (root_task, *derived_tasks)
))

print(all_task)

['oas', 'oa', 'as', 'a', 'o', 'asc', 'sc', 'c', 'oasc']


In [7]:
# Two targets that differ only in sentiment: reduced to "ao" they become identical,
# so de-duplication keeps a single entry.
conflicting_targets = [
    {'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},
    {'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"},
]
data_utils.remove_duplicate_targets(
    data_utils.reduce_targets(conflicting_targets,"ao")
)

[{'aspect': 'battery life', 'opinion': 'good'}]

Handling mixed sentiment may not be strictly necessary; we will revisit it later. It could become problematic when a reduced task such as AS (UABSA / E2E ABSA) is used to train AOS (ASTE). This is left for a further experiment, because imputation will be added later on.

In [8]:
# Same aspect/opinion pair with contradicting sentiments: with the sentiment kept
# ("aos"), handle_mix_sentiment collapses the pair into one "mixed" target.
contradicting_targets = [
    {'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},
    {'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"},
]
data_utils.handle_mix_sentiment(
    data_utils.reduce_targets(contradicting_targets,"aos")
)

[{'aspect': 'battery life', 'opinion': 'good', 'sentiment': 'mixed'}]

In [9]:
from copy import deepcopy

# Peng (ASTE/AOS)
peng_intermediate = dict()

# task_tree["oas"] already starts with "oas"; de-duplicate (order preserved) so the
# full task is not deep-copied and reduced a second time.
peng_tasks = list(dict.fromkeys(["oas"] + task_tree["oas"]))

for domain, splits in peng.items():
    peng_intermediate[domain] = dict()
    for task in peng_tasks:
        peng_intermediate[domain][task] = dict()
        for split, examples in splits.items():
            # Work on a copy so the raw corpus keeps its complete targets.
            reduced_examples = deepcopy(examples)
            for example in reduced_examples:
                # Keep only the sentiment elements named by `task`, then drop the
                # duplicates that the reduction can create.
                example["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(example["target"],task)
                )
            peng_intermediate[domain][task][split] = reduced_examples

In [10]:
# Wan (TASD/ACS)
wan_intermediate = dict()

# task_tree["asc"] already starts with "asc"; de-duplicate (order preserved) so the
# full task is not deep-copied and reduced a second time.
wan_tasks = list(dict.fromkeys(["asc"] + task_tree["asc"]))

for domain, splits in wan.items():
    wan_intermediate[domain] = dict()
    for task in wan_tasks:
        wan_intermediate[domain][task] = dict()
        for split, examples in splits.items():
            # Work on a copy so the raw corpus keeps its complete targets.
            reduced_examples = deepcopy(examples)
            for example in reduced_examples:
                # Keep only the sentiment elements named by `task`, then drop the
                # duplicates that the reduction can create.
                example["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(example["target"],task)
                )
            wan_intermediate[domain][task][split] = reduced_examples

In [11]:
# Zhang (ACOS)
zhang_intermediate = dict()

# task_tree["oasc"] already starts with "oasc"; de-duplicate (order preserved) so the
# full task is not deep-copied and reduced a second time.
zhang_tasks = list(dict.fromkeys(["oasc"] + task_tree["oasc"]))

for domain, splits in zhang.items():
    zhang_intermediate[domain] = dict()
    for task in zhang_tasks:
        zhang_intermediate[domain][task] = dict()
        for split, examples in splits.items():
            # Work on a copy so the raw corpus keeps its complete targets.
            reduced_examples = deepcopy(examples)
            for example in reduced_examples:
                # Keep only the sentiment elements named by `task`, then drop the
                # duplicates that the reduction can create.
                example["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(example["target"],task)
                )
            zhang_intermediate[domain][task][split] = reduced_examples

In [12]:
# William (AOS ID)
william_intermediate = dict()

# task_tree["oas"] already starts with "oas"; de-duplicate (order preserved) so the
# full task is not deep-copied and reduced a second time.
william_tasks = list(dict.fromkeys(["oas"] + task_tree["oas"]))

for domain, splits in william.items():
    william_intermediate[domain] = dict()
    for task in william_tasks:
        william_intermediate[domain][task] = dict()
        for split, examples in splits.items():
            # Work on a copy so the raw corpus keeps its complete targets.
            reduced_examples = deepcopy(examples)
            for example in reduced_examples:
                # Keep only the sentiment elements named by `task`, then drop the
                # duplicates that the reduction can create.
                example["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(example["target"],task)
                )
            william_intermediate[domain][task][split] = reduced_examples

# Answer Engineering

In [13]:
# Sentinel-token template in which 'X' stands for the index.
# NOTE(review): in the visible cells only the commented-out legacy helpers reference it.
mask = "<extra_id_X>"

In [14]:
# Punctuation that doubles as answer syntax ("( a , b ) ; ( c , d )") mapped to the
# placeholder that replaces it inside sentence text and element values.
added_tokens = {
    ',' : "<comma>",
    '(' : "<open_bracket>",
    ')' : "<close_bracket>",
    ';' : "<semicolon>"
}

In [15]:
# def construct_answer(targets,se_order):
#     result = []
#     counter = 0
#     for t in targets:
#         constructed_t = ""
#         for se in se_order:
#             if counter > 99:
#                 raise Exception("Extra id more than 99!")
#             constructed_t += ' ' + mask.replace('X',str(counter)) + ' ' + t[data_utils.SENTIMENT_ELEMENT[se]]
#             counter += 1
#         constructed_t = constructed_t.strip()
#         result.append(constructed_t)
#     result = " ; ".join(result)
#     return result
def construct_answer(targets,se_order):
    """
    Serialise a list of targets into the linearised answer string.

    * targets: list of dicts keyed by sentiment-element name.
    * se_order: iterable of one-letter element codes (e.g. "oas") giving the
      order of the fields inside each tuple.

    Each target becomes "( v1 , v2 , ... )" and tuples are joined with " ; ".
    Punctuation listed in `added_tokens` is replaced by its placeholder inside
    every value. An empty `targets` list yields "".
    """
    def _escape(value):
        # Swap structural punctuation for its placeholder token.
        for raw, placeholder in added_tokens.items():
            value = value.replace(raw, placeholder)
        return value

    tuples = []
    for target in targets:
        fields = [_escape(target[data_utils.SENTIMENT_ELEMENT[se]]) for se in se_order]
        tuples.append("( " + " , ".join(fields) + " )")
    return " ; ".join(tuples)

In [16]:
# Sanity check: answer string for one laptop training sentence on the full "oas" task.
construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")

'( no , GUI , negative ) ; ( dark , screen , negative ) ; ( steady , power light , neutral ) ; ( steady , hard drive light , negative )'

In [17]:
# Punctuation inside an element value is replaced by its placeholder token, so it
# cannot be mistaken for the tuple syntax.
construct_answer([{"aspect" : "tes1 , tes2", "opinion" : "( tes3 ; tes4 )", "sentiment" : "positive"}],"oas")

'( <open_bracket> tes3 <semicolon> tes4 <close_bracket> , tes1 <comma> tes2 , positive )'

# Prompt Engineering

In [18]:
# def construct_prompt(text,se_order):
#     prompt = []
#     for counter, se in enumerate(se_order):
#         prompt.append(data_utils.SENTIMENT_ELEMENT[se] + " : " + mask.replace('X',str(counter)))
#     prompt = " ,".join(prompt)
#     result = text + "| " + prompt
#     return result
def construct_prompt(text,se_order):
    """
    Build the model input: the escaped sentence followed by the answer template.

    * text: raw sentence.
    * se_order: iterable of one-letter element codes (e.g. "oas").

    Returns "<escaped text> | ( name1 , name2 , ... )", where punctuation listed
    in `added_tokens` has been replaced by its placeholder in the sentence.
    """
    element_names = [data_utils.SENTIMENT_ELEMENT[se] for se in se_order]
    template = "( " + " , ".join(element_names) + " )"

    escaped_text = text
    for raw, placeholder in added_tokens.items():
        escaped_text = escaped_text.replace(raw, placeholder)

    return escaped_text + " | " + template

In [19]:
# Sanity check: prompt built for one laptop training sentence on the full "oas" task.
construct_prompt(peng_intermediate["lap14"]["oas"]["train"][4]["text"],"oas")

'One night I turned the freaking thing off after using it <comma> the next day I turn it on <comma> no GUI <comma> screen all dark <comma> power light steady <comma> hard drive light steady and not flashing as it usually does . | ( opinion , aspect , sentiment )'

# Answer Catch

In [20]:
import re

# def catch_answer(output,se_order):
#     output = output.replace("<pad>",'')
#     output = output.replace("</s>",'')
#     pattern = r""
#     for se in se_order:
#         if se != 's':
#             pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT[se]}>[^;]+)\s*"
#         else:
#             pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT['s']}>positive|negative|neutral)\s*"
#     found = [found_iter.groupdict() for found_iter in re.finditer(pattern,output)]
#     for i in range(len(found)):
#         for k, v in found[i].items():
#             found[i][k] = found[i][k].strip()
#     return found
def catch_answer(output,se_order):
    """
    Parse a linearised answer string back into a list of target dicts.

    * output: decoded text such as "( no , GUI , negative ) ; ( dark , screen , negative )";
      "<pad>" and "</s>" markers are removed before matching.
    * se_order: iterable of one-letter element codes giving the field order.

    Returns one dict per matched tuple, keyed by sentiment-element name, with
    surrounding whitespace stripped from every value. The sentiment field only
    matches "positive", "negative" or "neutral"; tuples that do not fit the
    pattern are skipped.
    """
    output = output.replace("<pad>",'').replace("</s>",'')

    # Raw f-strings: "\s", "\(" and "\)" are regex escapes, not Python string escapes
    # (non-raw literals trigger an invalid-escape-sequence warning).
    groups = []
    for se in se_order:
        name = data_utils.SENTIMENT_ELEMENT[se]
        if se != 's':
            # Free-text element: anything up to the tuple separator ';'.
            groups.append(rf"\s*(?P<{name}>[^;]+)\s*")
        else:
            groups.append(rf"\s*(?P<{name}>positive|negative|neutral)\s*")
    pattern = r"\(" + ','.join(groups) + r"\)"

    return [
        {key: value.strip() for key, value in match.groupdict().items()}
        for match in re.finditer(pattern,output)
    ]

In [21]:
# Round trip: parsing a constructed answer should give back the original targets.
output = construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")
se_order = "oas"
catch_answer(output,se_order)

[{'opinion': 'no', 'aspect': 'GUI', 'sentiment': 'negative'},
 {'opinion': 'dark', 'aspect': 'screen', 'sentiment': 'negative'},
 {'opinion': 'steady', 'aspect': 'power light', 'sentiment': 'neutral'},
 {'opinion': 'steady', 'aspect': 'hard drive light', 'sentiment': 'negative'}]

In [22]:
# Display the raw answer string held in `output`.
output

'( no , GUI , negative ) ; ( dark , screen , negative ) ; ( steady , power light , neutral ) ; ( steady , hard drive light , negative )'

# Data Preprocessing 2

In [23]:
from datasets import Dataset

peng_2 = dict()
for domain, by_task in peng_intermediate.items():
    # (split, task) sources in build order: train mixes every task of the tree,
    # val and test only use the complex "oas" task.
    sources = [("train", basic_task) for basic_task in task_tree["oas"]]
    sources += [("val", "oas"), ("test", "oas")]

    rows = {"train" : [], "val" : [], "test" : []}
    for split, task in sources:
        rows[split].extend(
            {
                "input" : construct_prompt(el["text"],task),
                "output" : construct_answer(el["target"],task),
                "task" : task
            }
            for el in by_task[task][split]
        )

    peng_2[domain] = {split : Dataset.from_list(examples) for split, examples in rows.items()}

wan_2 = dict()
for domain, by_task in wan_intermediate.items():
    # (split, task) sources in build order: train mixes every task of the tree,
    # val and test only use the complex "asc" task.
    sources = [("train", basic_task) for basic_task in task_tree["asc"]]
    sources += [("val", "asc"), ("test", "asc")]

    rows = {"train" : [], "val" : [], "test" : []}
    for split, task in sources:
        rows[split].extend(
            {
                "input" : construct_prompt(el["text"],task),
                "output" : construct_answer(el["target"],task),
                "task" : task
            }
            for el in by_task[task][split]
        )

    wan_2[domain] = {split : Dataset.from_list(examples) for split, examples in rows.items()}

zhang_2 = dict()
for domain, by_task in zhang_intermediate.items():
    # (split, task) sources in build order: train mixes every task of the tree,
    # val and test only use the complex "oasc" task.
    sources = [("train", basic_task) for basic_task in task_tree["oasc"]]
    sources += [("val", "oasc"), ("test", "oasc")]

    rows = {"train" : [], "val" : [], "test" : []}
    for split, task in sources:
        rows[split].extend(
            {
                "input" : construct_prompt(el["text"],task),
                "output" : construct_answer(el["target"],task),
                "task" : task
            }
            for el in by_task[task][split]
        )

    zhang_2[domain] = {split : Dataset.from_list(examples) for split, examples in rows.items()}

william_2 = dict()
for domain, by_task in william_intermediate.items():
    # (split, task) sources in build order: train mixes every task of the tree,
    # val and test only use the complex "oas" task.
    sources = [("train", basic_task) for basic_task in task_tree["oas"]]
    sources += [("val", "oas"), ("test", "oas")]

    rows = {"train" : [], "val" : [], "test" : []}
    for split, task in sources:
        rows[split].extend(
            {
                "input" : construct_prompt(el["text"],task),
                "output" : construct_answer(el["target"],task),
                "task" : task
            }
            for el in by_task[task][split]
        )

    william_2[domain] = {split : Dataset.from_list(examples) for split, examples in rows.items()}

In [24]:
# Inspect one training example of the Indonesian hotel corpus (input, output, task).
william_2["hotel"]["train"][69]

{'input': 'tempat yag bagus dan nyaman untuk istirahat tetapi tolong tvnya perlu di perbaiki channelnya karena banyak semutnya digambar dan water heaternya tidak bisa jadi mandi air dingin terus . | ( opinion , aspect , sentiment )',
 'output': '( bagus , tempat , positive ) ; ( nyaman , tempat , positive ) ; ( perlu di perbaiki , tvnya , positive ) ; ( tidak bisa , water heaternya , negative )',
 'task': 'oas'}

# Prepare Tokenized Dataset

## English

In [25]:
# Tokenizer for the English corpora; same "t5-base" checkpoint name as the model
# loaded in the training cell.
tokenizer_en = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [26]:
# tokenizer_en.add_tokens(list(added_tokens.values()))

In [27]:
# Keyword arguments shared by every tokenizer call (English and Indonesian).
encoding_args = {
    "max_length" : 512, # upper bound applied when truncating
    "padding" : True, # pad to the longest sequence of each tokenizer call
    "truncation" : True,
    "return_tensors" : "pt" # return PyTorch tensors
}

In [28]:
def encode_en(dataset):
    """
    Tokenise a batch with the English tokenizer.

    `dataset` must expose an "input" and an "output" column; the outputs are
    passed as `text_target`. Tokenizer options come from `encoding_args`.
    """
    return tokenizer_en(
        dataset["input"],
        text_target=dataset["output"],
        **encoding_args
    )

In [29]:
peng_tok = dict()
for domain, splits in peng_2.items():
    peng_tok[domain] = dict()
    for split, dataset in splits.items():
        if split == "test":
            # Test split: encode the whole dataset in one tokenizer call.
            peng_tok[domain][split] = encode_en(dataset)
        else:
            # Train/val: tokenise batch-wise and drop the raw text columns.
            peng_tok[domain][split] = dataset.map(
                encode_en,
                batched=True,
                remove_columns=["input","output","task"]
            )

                                                                 

In [30]:
wan_tok = dict()
for domain, splits in wan_2.items():
    wan_tok[domain] = dict()
    for split, dataset in splits.items():
        if split == "test":
            # Test split: encode the whole dataset in one tokenizer call.
            wan_tok[domain][split] = encode_en(dataset)
        else:
            # Train/val: tokenise batch-wise and drop the raw text columns.
            wan_tok[domain][split] = dataset.map(
                encode_en,
                batched=True,
                remove_columns=["input","output","task"]
            )

                                                                 

In [31]:
zhang_tok = dict()
for domain, splits in zhang_2.items():
    zhang_tok[domain] = dict()
    for split, dataset in splits.items():
        if split == "test":
            # Test split: encode the whole dataset in one tokenizer call.
            zhang_tok[domain][split] = encode_en(dataset)
        else:
            # Train/val: tokenise batch-wise and drop the raw text columns.
            zhang_tok[domain][split] = dataset.map(
                encode_en,
                batched=True,
                remove_columns=["input","output","task"]
            )

                                                                 

## Indo

In [32]:
# Tokenizer for the Indonesian corpus (multilingual mT5 checkpoint).
tokenizer_id = AutoTokenizer.from_pretrained("google/mt5-base")



In [33]:
# tokenizer_id.add_tokens(list(added_tokens.values()))

In [34]:
def encode_id(dataset):
    """
    Tokenise a batch with the Indonesian (mT5) tokenizer.

    `dataset` must expose an "input" and an "output" column; the outputs are
    passed as `text_target`. Tokenizer options come from `encoding_args`.
    """
    return tokenizer_id(
        dataset["input"],
        text_target=dataset["output"],
        **encoding_args
    )

In [35]:
william_tok = dict()
for domain, splits in william_2.items():
    william_tok[domain] = dict()
    for split, dataset in splits.items():
        if split == "test":
            # Test split: encode the whole dataset in one tokenizer call.
            william_tok[domain][split] = encode_id(dataset)
        else:
            # Train/val: tokenise batch-wise and drop the raw text columns.
            william_tok[domain][split] = dataset.map(
                encode_id,
                batched=True,
                remove_columns=["input","output","task"]
            )

                                                                   

# Data Collator

## English

In [36]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for the English runs, built around the English tokenizer.
data_collator_en = DataCollatorForSeq2Seq(tokenizer=tokenizer_en)

## Indo

In [37]:
# Batch collator for the Indonesian runs, built around the mT5 tokenizer.
data_collator_id = DataCollatorForSeq2Seq(tokenizer=tokenizer_id)

# Compute Metrics

In [38]:
from transformers import EvalPrediction
from evaluation import recall, precision, f1_score, summary_score
from typing import List, Dict, Tuple
import numpy as np

def seperate_target_prediction_per_task(predictions:List[List[Dict]],targets:List[List[Dict]],tasks:List) -> Tuple[Dict[str,List],Dict[str,List]]:
    """
    Group per-example targets and predictions by task name.

    ### PARAMS
    * predictions: parsed predictions, one list per example.
    * targets: parsed targets, one list per example.
    * tasks: task name of each example, aligned with the two lists above.
    ### RETURN
    * (per_task_targets, per_task_predictions): note that targets come FIRST in
      the returned tuple although predictions come first in the parameters.
    """
    grouped_targets: Dict[str, List] = {}
    grouped_predictions: Dict[str, List] = {}
    for gold, predicted, task in zip(targets, predictions, tasks):
        grouped_targets.setdefault(task, []).append(gold)
        grouped_predictions.setdefault(task, []).append(predicted)
    return grouped_targets, grouped_predictions

def preprocess_eval_preds(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer):
    """
    Decode the inputs, labels and predictions of an evaluation pass into text.

    ### PARAMS
    * eval_preds: object exposing `inputs`, `label_ids` and `predictions`.
    * decoding_args: keyword arguments forwarded to `tokenizer.batch_decode`.
    * tokenizer: tokenizer used for decoding.
    ### RETURN
    * (inputs, targets, predictions): three lists of decoded strings.
    """
    def _decode(ids):
        # A tuple means extra outputs were returned alongside the ids; keep the first.
        if isinstance(ids, tuple):
            ids = ids[0]
        # Rank-3 arrays are treated as logits and reduced to token ids.
        if len(ids.shape) == 3:
            ids = np.argmax(ids,axis=-1)
        # -100 marks ignored positions and cannot be decoded.
        kept = [[token for token in row if token != -100] for row in ids]
        return tokenizer.batch_decode(kept,**decoding_args)

    inputs = _decode(eval_preds.inputs)
    targets = _decode(eval_preds.label_ids)
    predictions = _decode(eval_preds.predictions)

    return inputs, targets, predictions

def compute_metrics(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer,tasks:List) -> Dict[str,float]: # may not be sufficient for causal LMs
    """
    ### DESC
        Compute overall and per-task recall, precision and F1 for an evaluation pass.
    ### PARAMS
    * eval_preds: EvalPrediction instance from training.
    * decoding_args: Decoding arguments forwarded to the tokenizer.
    * tokenizer: Tokenizer used to decode ids back into text.
    * tasks: Task name of every evaluated example, aligned with the predictions.
      Examples whose task is "non_absa" are excluded from every metric.
    ### RETURN
    * metrics: Dictionary with "overall_*" entries followed by "<task>_*" entries.
    """
    _, targets, predictions = preprocess_eval_preds(eval_preds,decoding_args,tokenizer)

    # Filter targets, predictions AND tasks together so the three lists stay
    # aligned; filtering only the texts would shift every example after a
    # "non_absa" row onto the wrong task in the per-task grouping.
    parsed_targets, parsed_predictions, absa_tasks = [], [], []
    for target_text, prediction_text, task in zip(targets,predictions,tasks):
        if task == "non_absa":
            continue
        parsed_targets.append(catch_answer(target_text,task))
        parsed_predictions.append(catch_answer(prediction_text,task))
        absa_tasks.append(task)

    per_task_targets, per_task_predictions = seperate_target_prediction_per_task(parsed_predictions, parsed_targets, absa_tasks)

    metrics = {}

    metrics["overall_recall"] = recall(parsed_predictions,parsed_targets)
    metrics["overall_precision"] = precision(parsed_predictions,parsed_targets)
    metrics["overall_f1_score"] = f1_score(parsed_predictions,parsed_targets)

    for task in per_task_targets.keys():
        metrics[f"{task}_recall"] = recall(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_precision"] = precision(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_f1_score"] = f1_score(per_task_predictions[task],per_task_targets[task])

    return metrics

# Train Arguments

In [39]:
from transformers import Seq2SeqTrainingArguments

# torch.cuda.device_count() is 0 on a CPU-only machine; fall back to one device so
# the batch-size division below cannot raise ZeroDivisionError.
n_devices = max(n_gpu, 1)

# NOTE(review): "predict_with_generate" is not set here, and the trainer is given a
# preprocess_logits_for_metrics hook that takes the argmax of the logits, so the
# validation metrics appear to be computed on teacher-forced predictions rather than
# on generated text; confirm that this is intended.
train_args_dict = {
    "num_train_epochs": 20,
    "learning_rate": 3e-4,
    "save_total_limit": 2,
    "gradient_accumulation_steps": 2,
    "per_device_train_batch_size": 16//n_devices, # 16 examples per step across all devices
    "per_device_eval_batch_size": 16//n_devices,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "logging_strategy" : "epoch",
    "metric_for_best_model": "overall_f1_score", # key produced by compute_metrics
    "load_best_model_at_end": True,
    "adam_epsilon": 1e-08,
    "output_dir": "./output",
    "logging_dir" : "./output/log",
    "include_inputs_for_metrics" : True # compute_metrics also decodes the inputs
}

train_args = Seq2SeqTrainingArguments(**train_args_dict)

# Train

In [40]:
import torch
# Use the first visible GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [41]:
from transformers import Seq2SeqTrainer

# trainer = {
#     "peng" : {},
#     "wan" : {},
#     "zhang" : {},
#     "william" : {}
# }

# Keyword arguments for tokenizer.batch_decode. Special tokens are kept in the decoded
# text; catch_answer and save_result remove "<pad>" and "</s>" themselves.
decoding_args = {
    "skip_special_tokens" : False
}

def preprocess_logits_for_metrics(logits, targets):
    """
    Reduce logits to token ids before they are accumulated for metric computation.

    * logits: a tensor, or a tuple whose first element is the logits tensor.
    * targets: passed through unchanged.

    Returns `(argmax over the last dimension, targets)`.
    """
    if isinstance(logits,tuple):
        logits = logits[0]
    return logits.argmax(dim=-1), targets

In [42]:
from tqdm import tqdm

def generate_predictions(model,tokenizer,tokenized:torch.Tensor,device:torch.device=torch.device("cpu"),batch_size:int=16,max_len:int=512,decoding_args:Dict=None) -> List[str]:
    """
    Generate and decode predictions for an already tokenised dataset.

    ### PARAMS
    * model: object exposing `generate(...)`; it is not moved to `device` here, the
      caller is expected to have placed it there (and in the desired train/eval mode).
    * tokenizer: provides `pad_token_id`, `eos_token_id` and `batch_decode`.
    * tokenized: indexed with "input_ids" and "attention_mask" (i.e. used as a
      mapping of tensors, despite the annotation).
    * device: device the input batches are moved to.
    * batch_size: number of examples per generation batch.
    * max_len: `max_length` passed to `generate`.
    * decoding_args: keyword arguments for `tokenizer.batch_decode`; None means none.
    ### RETURN
    * predictions: one decoded string per example, in input order.
    """
    # A `{}` default would be one dict shared by every call; create it per call.
    if decoding_args is None:
        decoding_args = {}

    # Two loaders with identical batching and no shuffling stay aligned row by row.
    input_ids_data_loader = torch.utils.data.DataLoader(tokenized["input_ids"],
                        batch_size=batch_size,shuffle=False)
    attention_mask_data_loader = torch.utils.data.DataLoader(tokenized["attention_mask"],
                        batch_size=batch_size,shuffle=False)

    generated_rows = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(zip(input_ids_data_loader,attention_mask_data_loader)):
            generated = model.generate(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                max_length=max_len,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
            # Keep only CPU copies of the results so GPU memory is released per batch.
            generated_rows.extend(generated.cpu())

    # Drop -100 placeholders and hand plain Python ints to the tokenizer.
    token_ids = [row[row != -100].tolist() for row in generated_rows]
    return tokenizer.batch_decode(token_ids,**decoding_args)

In [43]:
import json

def save_result(str_preds_,preds,targets,filename):
    """Bundle decoded strings, predictions and targets into records and dump them as JSON.

    Args:
        str_preds_: Decoded prediction strings; every "<pad>" and "</s>"
            occurrence is removed before the string is stored.
        preds: Predictions, index-aligned with ``str_preds_``.
        targets: Targets, index-aligned with ``str_preds_``.
        filename: Path of the JSON file to (over)write.

    Returns:
        The list of ``{"str_pred", "pred", "target"}`` dicts written to disk.

    Raises:
        AssertionError: If the three inputs differ in length.
    """
    cleaned = [raw.replace("<pad>",'').replace("</s>",'') for raw in str_preds_]
    assert len(cleaned) == len(preds) == len(targets)

    records = [
        {
            "str_pred" : text,
            "pred" : preds[idx],
            "target" : targets[idx]
        }
        for idx, text in enumerate(cleaned)
    ]

    with open(filename,'w') as out_file:
        json.dump(records,out_file)
    return records

# Peng Laptop 2014

In [46]:
# Fine-tune a newly loaded t5-base checkpoint on the Peng lap14 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _peng_lap14_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Peng lap14 validation `task` column."""
    val_tasks = peng_2["lap14"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=peng_tok["lap14"]["train"],
    eval_dataset=peng_tok["lap14"]["val"],
    compute_metrics=_peng_lap14_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

***** Running training *****
  Num examples = 4530
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2840
  Number of trainable parameters = 222903552
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oas Recall,Oas Precision,Oas F1 Score
1,0.3276,0.059005,0.533333,0.559767,0.54623,0.533333,0.559767,0.54623
2,0.0273,0.058904,0.550725,0.576812,0.563466,0.550725,0.576812,0.563466
3,0.016,0.068212,0.562319,0.585507,0.573679,0.562319,0.585507,0.573679
4,0.0101,0.07524,0.576812,0.598837,0.587618,0.576812,0.598837,0.587618


***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
  Num examples = 219
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-142
Configuration saved in ./t5/checkpoint-142/config.json
Saving model checkpoint to ./t5/checkpoint-142
Configuration saved in ./t5/checkpoint-142/config.json
Model weights saved in ./t5/checkpoint-142/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-142/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-142/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-142/spiece.model
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
***** Running Evaluation *****
  Num examples = 219
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-284
Configuration saved in ./t5/checkpoint-284/config.json
Saving model checkpoint to ./t5/checkpoint-284
Configuration saved in ./t5/checkpoint-284/config.json
Model weights saved in ./t5/checkpoint-284/pytorch_model.bin
tokenizer config file s

TrainOutput(global_step=2840, training_loss=0.020601016940207967, metrics={'train_runtime': 1453.2227, 'train_samples_per_second': 62.344, 'train_steps_per_second': 1.954, 'total_flos': 1.282296902602752e+16, 'train_loss': 0.020601016940207967, 'epoch': 20.0})

In [47]:
# Generate on the Peng lap14 test split, then run catch_answer ("oas" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    peng_tok["lap14"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "oas") for decoded in str_preds]

11it [00:11,  1.00s/it]


In [48]:
# Run the reference outputs of the Peng lap14 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "oas")
    for reference in peng_2["lap14"]["test"]["output"]
]

In [49]:
# Compare the parsed Peng lap14 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.5951940850277264,
 'precision': 0.5940959409594095,
 'f1_score': 0.5946445060018467}

In [50]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
import gc

# `trainer` keeps its own references to the model (and its optimizer state), so
# deleting only `model` leaves the weights alive and empty_cache() frees nothing.
# Drop both names, collect unreachable objects, then return cached blocks to CUDA.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

In [52]:
# Write the Peng lap14 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="peng_lap14.json",
)

# Peng Restaurant 2014

In [53]:
# Fine-tune a newly loaded t5-base checkpoint on the Peng res14 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _peng_res14_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Peng res14 validation `task` column."""
    val_tasks = peng_2["res14"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=peng_tok["res14"]["train"],
    eval_dataset=peng_tok["res14"]["val"],
    compute_metrics=_peng_res14_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oas Recall,Oas Precision,Oas F1 Score
1,0.2209,0.0449,0.590278,0.609842,0.5999,0.590278,0.609842,0.5999
2,0.019,0.045037,0.618056,0.631579,0.624744,0.618056,0.631579,0.624744
3,0.0109,0.054908,0.611111,0.624782,0.617871,0.611111,0.624782,0.617871


***** Running Evaluation *****
  Num examples = 310
  Batch size = 32
  Num examples = 310
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-198
Configuration saved in ./t5/checkpoint-198/config.json
Saving model checkpoint to ./t5/checkpoint-198
Configuration saved in ./t5/checkpoint-198/config.json
Model weights saved in ./t5/checkpoint-198/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-198/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-198/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-198/spiece.model
***** Running Evaluation *****
  Num examples = 310
  Batch size = 32
***** Running Evaluation *****
  Num examples = 310
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-396
Configuration saved in ./t5/checkpoint-396/config.json
Saving model checkpoint to ./t5/checkpoint-396
Configuration saved in ./t5/checkpoint-396/config.json
Model weights saved in ./t5/checkpoint-396/pytorch_model.bin
tokenizer config file s

TrainOutput(global_step=3960, training_loss=0.013948131309389466, metrics={'train_runtime': 2508.4329, 'train_samples_per_second': 50.47, 'train_steps_per_second': 1.579, 'total_flos': 2.33373120841728e+16, 'train_loss': 0.013948131309389466, 'epoch': 20.0})

In [54]:
# Generate on the Peng res14 test split, then run catch_answer ("oas" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    peng_tok["res14"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "oas") for decoded in str_preds]

16it [00:20,  1.28s/it]


In [55]:
# Run the reference outputs of the Peng res14 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "oas")
    for reference in peng_2["res14"]["test"]["output"]
]

In [56]:
# Compare the parsed Peng res14 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.7152917505030181,
 'precision': 0.7244174265450861,
 'f1_score': 0.719825666723541}

In [57]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [58]:
# Write the Peng res14 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="peng_res14.json",
)

# Peng Restaurant 2015

In [59]:
# Fine-tune a newly loaded t5-base checkpoint on the Peng res15 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _peng_res15_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Peng res15 validation `task` column."""
    val_tasks = peng_2["res15"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=peng_tok["res15"]["train"],
    eval_dataset=peng_tok["res15"]["val"],
    compute_metrics=_peng_res15_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oas Recall,Oas Precision,Oas F1 Score
1,0.4143,0.062185,0.618474,0.634538,0.626403,0.618474,0.634538,0.626403
2,0.0149,0.057951,0.666667,0.674699,0.670659,0.666667,0.674699,0.670659
3,0.0081,0.061996,0.706827,0.710843,0.70883,0.706827,0.710843,0.70883
4,0.0047,0.071031,0.690763,0.698795,0.694756,0.690763,0.698795,0.694756
5,0.0033,0.071471,0.694779,0.702811,0.698772,0.694779,0.702811,0.698772
6,0.0025,0.07922,0.73494,0.73494,0.73494,0.73494,0.73494,0.73494
7,0.0024,0.078953,0.702811,0.710843,0.706804,0.702811,0.710843,0.706804
8,0.0022,0.079194,0.730924,0.738956,0.734918,0.730924,0.738956,0.734918
9,0.0013,0.092307,0.722892,0.730924,0.726885,0.722892,0.730924,0.726885


***** Running Evaluation *****
  Num examples = 148
  Batch size = 32
  Num examples = 148
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-95
Configuration saved in ./t5/checkpoint-95/config.json
Saving model checkpoint to ./t5/checkpoint-95
Configuration saved in ./t5/checkpoint-95/config.json
Model weights saved in ./t5/checkpoint-95/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-95/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-95/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-95/spiece.model
***** Running Evaluation *****
  Num examples = 148
  Batch size = 32
***** Running Evaluation *****
  Num examples = 148
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-190
Configuration saved in ./t5/checkpoint-190/config.json
Saving model checkpoint to ./t5/checkpoint-190
Configuration saved in ./t5/checkpoint-190/config.json
Model weights saved in ./t5/checkpoint-190/pytorch_model.bin
tokenizer config file saved in 

TrainOutput(global_step=1900, training_loss=0.022991722289864954, metrics={'train_runtime': 1166.5215, 'train_samples_per_second': 51.864, 'train_steps_per_second': 1.629, 'total_flos': 1.050571298304e+16, 'train_loss': 0.022991722289864954, 'epoch': 20.0})

In [60]:
# Generate on the Peng res15 test split, then run catch_answer ("oas" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    peng_tok["res15"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "oas") for decoded in str_preds]

11it [00:11,  1.08s/it]


In [61]:
# Run the reference outputs of the Peng res15 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "oas")
    for reference in peng_2["res15"]["test"]["output"]
]

In [62]:
# Compare the parsed Peng res15 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.6350515463917525,
 'precision': 0.5849056603773585,
 'f1_score': 0.6089479894129276}

In [63]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [64]:
# Write the Peng res15 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="peng_res15.json",
)

# Peng Restaurant 2016

In [65]:
# Fine-tune a newly loaded t5-base checkpoint on the Peng res16 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _peng_res16_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Peng res16 validation `task` column."""
    val_tasks = peng_2["res16"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=peng_tok["res16"]["train"],
    eval_dataset=peng_tok["res16"]["val"],
    compute_metrics=_peng_res16_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oas Recall,Oas Precision,Oas F1 Score
1,0.291,0.045238,0.690265,0.707965,0.699003,0.690265,0.707965,0.699003
2,0.0164,0.048301,0.693215,0.709581,0.701303,0.693215,0.709581,0.701303


***** Running Evaluation *****
  Num examples = 210
  Batch size = 32
  Num examples = 210
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-134
Configuration saved in ./t5/checkpoint-134/config.json
Saving model checkpoint to ./t5/checkpoint-134
Configuration saved in ./t5/checkpoint-134/config.json
Model weights saved in ./t5/checkpoint-134/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-134/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-134/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-134/spiece.model
***** Running Evaluation *****
  Num examples = 210
  Batch size = 32
***** Running Evaluation *****
  Num examples = 210
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-268
Configuration saved in ./t5/checkpoint-268/config.json
Saving model checkpoint to ./t5/checkpoint-268
Configuration saved in ./t5/checkpoint-268/config.json
Model weights saved in ./t5/checkpoint-268/pytorch_model.bin
tokenizer config file s

TrainOutput(global_step=2680, training_loss=0.017107220226203774, metrics={'train_runtime': 1556.9113, 'train_samples_per_second': 55.045, 'train_steps_per_second': 1.721, 'total_flos': 1.4881646324736e+16, 'train_loss': 0.017107220226203774, 'epoch': 20.0})

In [66]:
# Generate on the Peng res16 test split, then run catch_answer ("oas" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    peng_tok["res16"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "oas") for decoded in str_preds]

11it [00:18,  1.72s/it]


In [67]:
# Run the reference outputs of the Peng res16 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "oas")
    for reference in peng_2["res16"]["test"]["output"]
]

In [68]:
# Compare the parsed Peng res16 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.7354085603112841,
 'precision': 0.6786355475763016,
 'f1_score': 0.7058823529411765}

In [69]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
# Write the Peng res16 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="peng_res16.json",
)

# Wan Restaurant 2015

In [71]:
# Fine-tune a newly loaded t5-base checkpoint on the Wan res15 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _wan_res15_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Wan res15 validation `task` column."""
    val_tasks = wan_2["res15"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=wan_tok["res15"]["train"],
    eval_dataset=wan_tok["res15"]["val"],
    compute_metrics=_wan_res15_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=3500, training_loss=0.015522921905985901, metrics={'train_runtime': 2334.1643, 'train_samples_per_second': 47.983, 'train_steps_per_second': 1.499, 'total_flos': 2.091321247629312e+16, 'train_loss': 0.015522921905985901, 'epoch': 20.0})

In [72]:
# Generate on the Wan res15 test split, then run catch_answer ("asc" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    wan_tok["res15"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "asc") for decoded in str_preds]

19it [00:28,  1.49s/it]


In [73]:
# Run the reference outputs of the Wan res15 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "asc")
    for reference in wan_2["res15"]["test"]["output"]
]

In [74]:
# Compare the parsed Wan res15 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.5857988165680473,
 'precision': 0.6309963099630996,
 'f1_score': 0.6075581395348837}

In [75]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [76]:
# Write the Wan res15 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="wan_res15.json",
)

# Wan Restaurant 2016

In [77]:
# Fine-tune a newly loaded t5-base checkpoint on the Wan res16 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _wan_res16_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Wan res16 validation `task` column."""
    val_tasks = wan_2["res16"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=wan_tok["res16"]["train"],
    eval_dataset=wan_tok["res16"]["val"],
    compute_metrics=_wan_res16_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 29
  Batch size = 32
  Num examples = 29
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-267
Configuration saved in ./t5/checkpoint-267/config.json
Saving model checkpoint to ./t5/checkpoint-267
Configuration saved in ./t5/checkpoint-267/config.json
Model weights saved in ./t5/checkpoint-267/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-267/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-267/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-267/spiece.model
***** Running Evaluation *****
  Num examples = 29
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-534
Configuration saved in ./t5/checkpoint-534/config.json
Model weights saved in ./t5/checkpoint-534/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-534/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-534/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-534/spiece.model


TrainOutput(global_step=5340, training_loss=0.011299087121133948, metrics={'train_runtime': 3511.3328, 'train_samples_per_second': 48.642, 'train_steps_per_second': 1.521, 'total_flos': 3.188101983363072e+16, 'train_loss': 0.011299087121133948, 'epoch': 20.0})

In [78]:
# Generate on the Wan res16 test split, then run catch_answer ("asc" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    wan_tok["res16"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "asc") for decoded in str_preds]

19it [00:31,  1.64s/it]


In [79]:
# Run the reference outputs of the Wan res16 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "asc")
    for reference in wan_2["res16"]["test"]["output"]
]

In [80]:
# Compare the parsed Wan res16 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.6880093131548312,
 'precision': 0.7033096926713948,
 'f1_score': 0.6955753735321548}

In [81]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [82]:
# Write the Wan res16 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="wan_res16.json",
)

# Zhang Restaurant 2015

In [83]:
# Fine-tune a newly loaded t5-base checkpoint on the Zhang res15 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _zhang_res15_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Zhang res15 validation `task` column."""
    val_tasks = zhang_2["res15"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=zhang_tok["res15"]["train"],
    eval_dataset=zhang_tok["res15"]["val"],
    compute_metrics=_zhang_res15_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oasc Recall,Oasc Precision,Oasc F1 Score
1,0.2404,0.061083,0.391931,0.398256,0.395068,0.391931,0.398256,0.395068
2,0.0185,0.060917,0.458213,0.469388,0.463733,0.458213,0.469388,0.463733
3,0.0106,0.069981,0.469741,0.476744,0.473216,0.469741,0.476744,0.473216


***** Running Evaluation *****
  Num examples = 209
  Batch size = 32
  Num examples = 209
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-183
Configuration saved in ./t5/checkpoint-183/config.json
Saving model checkpoint to ./t5/checkpoint-183
Configuration saved in ./t5/checkpoint-183/config.json
Model weights saved in ./t5/checkpoint-183/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-183/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-183/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-183/spiece.model
***** Running Evaluation *****
  Num examples = 209
  Batch size = 32
***** Running Evaluation *****
  Num examples = 209
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-366
Configuration saved in ./t5/checkpoint-366/config.json
Saving model checkpoint to ./t5/checkpoint-366
Configuration saved in ./t5/checkpoint-366/config.json
Model weights saved in ./t5/checkpoint-366/pytorch_model.bin
tokenizer config file s

TrainOutput(global_step=3660, training_loss=0.0151078967784679, metrics={'train_runtime': 2349.8352, 'train_samples_per_second': 49.689, 'train_steps_per_second': 1.558, 'total_flos': 1.555141912713216e+16, 'train_loss': 0.0151078967784679, 'epoch': 20.0})

In [84]:
# Generate on the Zhang res15 test split, then run catch_answer ("oasc" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    zhang_tok["res15"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "oasc") for decoded in str_preds]

17it [00:42,  2.48s/it]


In [85]:
# Run the reference outputs of the Zhang res15 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "oasc")
    for reference in zhang_2["res15"]["test"]["output"]
]

In [86]:
# Compare the parsed Zhang res15 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.4641509433962264,
 'precision': 0.462111801242236,
 'f1_score': 0.46312912776133}

In [87]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [88]:
# Write the Zhang res15 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="zhang_res15.json",
)

# Zhang Restaurant 2016

In [89]:
# Fine-tune a newly loaded t5-base checkpoint on the Zhang res16 train split,
# evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model = model.to(device)


def _zhang_res16_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the Zhang res16 validation `task` column."""
    val_tasks = zhang_2["res16"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_en, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_en,
    data_collator=data_collator_en,
    train_dataset=zhang_tok["res16"]["train"],
    eval_dataset=zhang_tok["res16"]["val"],
    compute_metrics=_zhang_res16_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-base/snapshots/fe6d9bf207cd3337512ca838a8b453f87a9178ef/config.json
Model config T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_si

Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oasc Recall,Oasc Precision,Oasc F1 Score
1,0.176,0.054857,0.529644,0.534791,0.532205,0.529644,0.534791,0.532205
2,0.0186,0.066848,0.527668,0.531746,0.529699,0.527668,0.531746,0.529699
3,0.0106,0.07275,0.561265,0.565737,0.563492,0.561265,0.565737,0.563492
4,0.0067,0.079248,0.565217,0.569444,0.567323,0.565217,0.569444,0.567323


***** Running Evaluation *****
  Num examples = 316
  Batch size = 32
  Num examples = 316
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-277
Configuration saved in ./t5/checkpoint-277/config.json
Saving model checkpoint to ./t5/checkpoint-277
Configuration saved in ./t5/checkpoint-277/config.json
Model weights saved in ./t5/checkpoint-277/pytorch_model.bin
tokenizer config file saved in ./t5/checkpoint-277/tokenizer_config.json
Special tokens file saved in ./t5/checkpoint-277/special_tokens_map.json
Copy vocab file to ./t5/checkpoint-277/spiece.model
***** Running Evaluation *****
  Num examples = 316
  Batch size = 32
***** Running Evaluation *****
  Num examples = 316
  Batch size = 32
Saving model checkpoint to ./t5/checkpoint-554
Configuration saved in ./t5/checkpoint-554/config.json
Saving model checkpoint to ./t5/checkpoint-554
Configuration saved in ./t5/checkpoint-554/config.json
Model weights saved in ./t5/checkpoint-554/pytorch_model.bin
tokenizer config file s

TrainOutput(global_step=5540, training_loss=0.011752677950268403, metrics={'train_runtime': 3969.6364, 'train_samples_per_second': 44.578, 'train_steps_per_second': 1.396, 'total_flos': 3.364751152447488e+16, 'train_loss': 0.011752677950268403, 'epoch': 20.0})

In [90]:
# Generate on the Zhang res16 test split, then run catch_answer ("oasc" format) on each decoded string.
str_preds = generate_predictions(
    model,
    tokenizer_en,
    zhang_tok["res16"]["test"],
    device=device,
    batch_size=32,
    max_len=512,
    decoding_args=decoding_args,
)
preds = [catch_answer(decoded, "oasc") for decoded in str_preds]

17it [01:00,  3.53s/it]


In [91]:
# Run the reference outputs of the Zhang res16 test split through the same catch_answer call.
targets = [
    catch_answer(reference, "oasc")
    for reference in zhang_2["res16"]["test"]["output"]
]

In [92]:
# Compare the parsed Zhang res16 test predictions with the parsed targets; the
# returned dict (recall / precision / f1_score) is shown as the cell output.
summary_score(preds,targets)

{'recall': 0.5857321652065082,
 'precision': 0.5707317073170731,
 'f1_score': 0.5781346510191475}

In [93]:
# Shell escape: recursively delete ./output (train_args' logging_dir, ./output/log, lives under it).
# NOTE(review): the training log above reports checkpoints under ./t5, which this
# command does not remove — confirm whether those should be cleaned up too.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [94]:
# Write the Zhang res16 decoded strings, parsed predictions and targets to JSON.
result = save_result(
    str_preds_=str_preds,
    preds=preds,
    targets=targets,
    filename="zhang_res16.json",
)

# William Hotel

In [44]:
# Fine-tune a newly loaded google/mt5-base checkpoint on the William hotel train
# split, evaluating on its validation split.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
model = model.to(device)


def _william_hotel_val_metrics(eval_preds):
    """Forward evaluation predictions to compute_metrics with the William hotel validation `task` column."""
    val_tasks = william_2["hotel"]["val"]["task"]
    return compute_metrics(eval_preds, decoding_args, tokenizer_id, val_tasks)


trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    tokenizer=tokenizer_id,
    # NOTE(review): this run uses tokenizer_id but the collator is data_collator_en —
    # confirm whether a collator built for tokenizer_id was intended here.
    data_collator=data_collator_en,
    train_dataset=william_tok["hotel"]["train"],
    eval_dataset=william_tok["hotel"]["val"],
    compute_metrics=_william_hotel_val_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Kept as the last expression so the TrainOutput summary is displayed.
trainer.train()

***** Running training *****
  Num examples = 15000
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 9380
  Number of trainable parameters = 582401280
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Overall Recall,Overall Precision,Overall F1 Score,Oas Recall,Oas Precision,Oas F1 Score
1,2.8921,0.103867,0.05171,0.091962,0.066197,0.05171,0.091962,0.066197
2,0.0677,0.021838,0.700326,0.731749,0.715692,0.700326,0.731749,0.715692
3,0.0733,0.019236,0.740635,0.77292,0.756433,0.740635,0.77292,0.756433
4,0.0199,0.017004,0.787866,0.815579,0.801483,0.787866,0.815579,0.801483
5,0.0134,0.017104,0.802932,0.830109,0.816294,0.802932,0.830109,0.816294
6,0.0092,0.018516,0.799674,0.824517,0.811906,0.799674,0.824517,0.811906
7,0.0068,0.018313,0.820033,0.844156,0.831919,0.820033,0.844156,0.831919
8,0.0053,0.019332,0.815961,0.839615,0.827619,0.815961,0.839615,0.827619


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-469
Configuration saved in ./output/checkpoint-469/config.json
Saving model checkpoint to ./output/checkpoint-469
Configuration saved in ./output/checkpoint-469/config.json
Model weights saved in ./output/checkpoint-469/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-469/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-469/special_tokens_map.json
Copy vocab file to ./output/checkpoint-469/spiece.model
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-938
Configuration saved in ./output/checkpoint-938/config.json
Saving model checkpoint to ./output/checkpoint-938
Configuration saved in ./output/checkpoint-938/config.json
Model weights saved in ./output/checkpoint-938/pytorch_model.bin
tokenizer config file saved in ./outpu

TrainOutput(global_step=9380, training_loss=0.1555819763525971, metrics={'train_runtime': 14081.3567, 'train_samples_per_second': 21.305, 'train_steps_per_second': 0.666, 'total_flos': 1.2467638586821018e+17, 'train_loss': 0.1555819763525971, 'epoch': 20.0})

In [45]:
# Decode the William hotel test split with the current `model`, then pass every
# decoded string through catch_answer with the "oas" format tag.
# NOTE(review): 32 and 512 are positional arguments of generate_predictions;
# presumably batch size and max sequence length -- confirm against its signature.
hotel_test_tok = william_tok["hotel"]["test"]
str_preds = generate_predictions(
    model, tokenizer_id, hotel_test_tok, device, 32, 512, decoding_args
)
preds = [catch_answer(decoded, "oas") for decoded in str_preds]

32it [01:34,  2.96s/it]


In [46]:
# Gold side of the comparison: the reference "output" strings of the William
# hotel test split go through the same catch_answer(..., "oas") call that is
# applied to the model predictions.
gold_outputs = william_2["hotel"]["test"]["output"]
targets = [catch_answer(gold, "oas") for gold in gold_outputs]

In [47]:
# Compare predictions with the gold targets. The call is left as the bare last
# expression so its result (a dict with 'recall', 'precision' and 'f1_score')
# is rendered as the cell output.
summary_score(preds,targets)

{'recall': 0.7692875599852345,
 'precision': 0.817149569303054,
 'f1_score': 0.79249657828438}

In [48]:
# Delete the Trainer checkpoint directory (the training log shows checkpoints
# being written under ./output). This is done in-process instead of via
# `!rm -rf`: the shell escape forks the kernel after the fast tokenizer has
# already used parallelism, which triggers the huggingface/tokenizers
# "process just got forked" warning, and it only works where `rm` exists.
# ignore_errors=True mirrors the `-f` flag, so a missing directory is not an error.
import shutil

shutil.rmtree("./output", ignore_errors=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [49]:
# Hand the raw decoded strings, the catch_answer-processed predictions and the
# gold targets to save_result under the name "william_hotel.json"; whatever it
# returns is kept in `result`.
# NOTE(review): the file name is relative -- presumably save_result decides the
# destination directory and serialisation; confirm in its definition.
result = save_result(str_preds, preds, targets, "william_hotel.json")