In [1]:
import os
import torch

# Restrict this process to the GPU with index 2; this is set before the first CUDA query below.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Number of CUDA devices torch can see; used later to derive the per-device batch sizes.
n_gpu = torch.cuda.device_count()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer
sys.path.append("../../../src/")
import data_utils

# Dataset Utilities

In [4]:
# Root folder of every corpus, grouped by source and keyed by domain.
peng_dir = {
    "lap14": "../../../data/absa/en/peng/14lap",
    "res14": "../../../data/absa/en/peng/14res",
    "res15": "../../../data/absa/en/peng/15res",
    "res16": "../../../data/absa/en/peng/16res",
}

wan_dir = {
    "res15": "../../../data/absa/en/wan/interim/rest15",
    "res16": "../../../data/absa/en/wan/interim/rest16",
}

zhang_dir = {
    "res15": "../../../data/absa/en/zhang/interim/interim_2/rest15",
    "res16": "../../../data/absa/en/zhang/interim/interim_2/rest16",
}

william_dir = {
    "hotel": "../../../data/absa/id/william",
}

# Every corpus is loaded as {domain: {"train" | "val" | "test": read_data(...)}}.
# The "val" split is stored on disk under the "dev" file stem.
peng = {
    domain: {
        split: data_utils.read_data(path=f"{root}/{stem}_triplets.txt",
                                    target_format="aos")
        for split, stem in (("train", "train"), ("val", "dev"), ("test", "test"))
    }
    for domain, root in peng_dir.items()
}

wan = {
    domain: {
        split: data_utils.read_data(path=f"{root}/{stem}.txt",
                                    target_format="acs")
        for split, stem in (("train", "train"), ("val", "dev"), ("test", "test"))
    }
    for domain, root in wan_dir.items()
}

zhang = {
    domain: {
        split: data_utils.read_data(path=f"{root}/{stem}.txt",
                                    target_format="acso")
        for split, stem in (("train", "train"), ("val", "dev"), ("test", "test"))
    }
    for domain, root in zhang_dir.items()
}

william = {
    domain: {
        split: data_utils.read_data(path=f"{root}/{stem}.txt",
                                    target_format="aos")
        for split, stem in (("train", "train"), ("val", "dev"), ("test", "test"))
    }
    for domain, root in william_dir.items()
}

# Data Preprocessing 1

In [None]:
# Set the letter -> element-name mapping on data_utils; these single letters are the ones
# used to spell task strings such as "oas" throughout this notebook.
data_utils.SENTIMENT_ELEMENT = {'a' : "aspect", 'o' : "opinion", 's' : "sentiment", 'c' : "category"}

1. AOS (ASTE)
    * AO
    * AS
    * A
    * O

2. ACS (TASD)
    * AS
    * CS
    * A
    * C

3. ACOS
    * AO
    * AS
    * CS
    * A
    * O
    * C

In [None]:
# For every complex task (key): the tasks trained together with it, i.e. the complex
# task itself followed by its simpler sub-tasks.
task_tree = dict(
    oas=["oas", "oa", "as", 'a', 'o'],
    asc=["asc", "as", "sc", 'a', 'c'],
    oasc=["oasc", "oa", "as", "sc", 'a', 'o', 'c'],
)

# Every task name exactly once, in first-seen order (dict keys keep insertion order).
all_task = list(dict.fromkeys(
    name
    for complex_task, members in task_tree.items()
    for name in (complex_task, *members)
))

print(all_task)

['oas', 'oa', 'as', 'a', 'o', 'asc', 'sc', 'c', 'oasc']


In [None]:
# Demo: two triplets that differ only in sentiment are reduced to their "ao" elements and then de-duplicated.
data_utils.remove_duplicate_targets(data_utils.reduce_targets([{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"}],"ao"))

[{'aspect': 'battery life', 'opinion': 'good'}]

Handling mixed sentiment may not be strictly necessary; we will revisit it later. It becomes problematic when, for example, AS data (UABSA / E2E-ABSA) is used to train AOS (ASTE). This is left for a further experiment, because imputation will be added later on.

In [None]:
# Demo: the same two conflicting triplets, kept with all "aos" elements, passed through handle_mix_sentiment.
data_utils.handle_mix_sentiment(data_utils.reduce_targets([{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "positive"},{'aspect': 'battery life', 'opinion': 'good', "sentiment" : "negative"}],"aos"))

[{'aspect': 'battery life', 'opinion': 'good', 'sentiment': 'mixed'}]

In [None]:
from copy import deepcopy

# Peng (ASTE/AOS)
# Build peng_intermediate[domain][task][split]: a copy of every split whose targets are
# reduced to the elements named by `task` and then de-duplicated.
peng_intermediate = dict()

for domain, v1 in peng.items():
    peng_intermediate[domain] = dict()
    # task_tree["oas"] already starts with "oas"; dict.fromkeys drops the repeated name
    # (keeping the order) so the complex task is not deep-copied and reduced twice.
    for task in dict.fromkeys(["oas"] + task_tree["oas"]):
        peng_intermediate[domain][task] = dict()
        for split, ds in v1.items():
            # Work on a deep copy so the loaded dataset itself is never modified.
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce to the task's elements, then remove duplicates.
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(ds_copy[i]["target"],task)
                )
            peng_intermediate[domain][task][split] = ds_copy

In [None]:
# Wan (TASD/ACS)
# Build wan_intermediate[domain][task][split]: a copy of every split whose targets are
# reduced to the elements named by `task` and then de-duplicated.
wan_intermediate = dict()

for domain, v1 in wan.items():
    wan_intermediate[domain] = dict()
    # task_tree["asc"] already starts with "asc"; dict.fromkeys drops the repeated name
    # (keeping the order) so the complex task is not deep-copied and reduced twice.
    for task in dict.fromkeys(["asc"] + task_tree["asc"]):
        wan_intermediate[domain][task] = dict()
        for split, ds in v1.items():
            # Work on a deep copy so the loaded dataset itself is never modified.
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce to the task's elements, then remove duplicates.
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(ds_copy[i]["target"],task)
                )
            wan_intermediate[domain][task][split] = ds_copy

In [None]:
# Zhang (ACOS)
# Build zhang_intermediate[domain][task][split]: a copy of every split whose targets are
# reduced to the elements named by `task` and then de-duplicated.
zhang_intermediate = dict()

for domain, v1 in zhang.items():
    zhang_intermediate[domain] = dict()
    # task_tree["oasc"] already starts with "oasc"; dict.fromkeys drops the repeated name
    # (keeping the order) so the complex task is not deep-copied and reduced twice.
    for task in dict.fromkeys(["oasc"] + task_tree["oasc"]):
        zhang_intermediate[domain][task] = dict()
        for split, ds in v1.items():
            # Work on a deep copy so the loaded dataset itself is never modified.
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce to the task's elements, then remove duplicates.
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(ds_copy[i]["target"],task)
                )
            zhang_intermediate[domain][task][split] = ds_copy

In [None]:
# William (AOS ID)
# Build william_intermediate[domain][task][split]: a copy of every split whose targets are
# reduced to the elements named by `task` and then de-duplicated.
william_intermediate = dict()

for domain, v1 in william.items():
    william_intermediate[domain] = dict()
    # task_tree["oas"] already starts with "oas"; dict.fromkeys drops the repeated name
    # (keeping the order) so the complex task is not deep-copied and reduced twice.
    for task in dict.fromkeys(["oas"] + task_tree["oas"]):
        william_intermediate[domain][task] = dict()
        for split, ds in v1.items():
            # Work on a deep copy so the loaded dataset itself is never modified.
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce to the task's elements, then remove duplicates.
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(
                    data_utils.reduce_targets(ds_copy[i]["target"],task)
                )
            william_intermediate[domain][task][split] = ds_copy

# Answer Engineering

In [None]:
# Sentinel-token template; the commented-out legacy helpers replace 'X' with a running counter.
# The active construct_answer / construct_prompt definitions do not reference it.
mask = "<extra_id_X>"

In [None]:
# Placeholder spelled out for each character that doubles as answer-format syntax
# (element separator, tuple brackets, tuple separator), so text containing them stays parseable.
added_tokens = dict(zip(
    (',', '(', ')', ';'),
    ("<comma>", "<open_bracket>", "<close_bracket>", "<semicolon>"),
))

In [None]:
# def construct_answer(targets,se_order):
#     result = []
#     counter = 0
#     for t in targets:
#         constructed_t = ""
#         for se in se_order:
#             if counter > 99:
#                 raise Exception("Extra id more than 99!")
#             constructed_t += ' ' + mask.replace('X',str(counter)) + ' ' + t[data_utils.SENTIMENT_ELEMENT[se]]
#             counter += 1
#         constructed_t = constructed_t.strip()
#         result.append(constructed_t)
#     result = " ; ".join(result)
#     return result
def construct_answer(targets,se_order):
    """
    ### DESC
        Serialize target tuples into the answer string "( e1 , e2 ) ; ( e1 , e2 )".
    ### PARAMS
    * targets: List of dictionaries keyed by sentiment-element name.
    * se_order: Iterable of element letters giving the order of elements inside a tuple.
    ### RETURN
    * Answer string; an empty string when targets is empty.
    """
    def _escape(value):
        # Swap every format character for its placeholder from added_tokens.
        for raw, placeholder in added_tokens.items():
            value = value.replace(raw, placeholder)
        return value

    serialized = [
        "( " + " , ".join(_escape(t[data_utils.SENTIMENT_ELEMENT[se]]) for se in se_order) + " )"
        for t in targets
    ]
    return " ; ".join(serialized)

In [None]:
# Demo: serialize the targets of one laptop training example in "oas" order.
construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")

'( no , GUI , negative ) ; ( dark , screen , negative ) ; ( steady , power light , neutral ) ; ( steady , hard drive light , negative )'

In [None]:
# Demo: elements that contain the format characters , ( ) ; are written with their placeholders.
construct_answer([{"aspect" : "tes1 , tes2", "opinion" : "( tes3 ; tes4 )", "sentiment" : "positive"}],"oas")

'( <open_bracket> tes3 <semicolon> tes4 <close_bracket> , tes1 <comma> tes2 , positive )'

# Prompt Engineering

In [None]:
# def construct_prompt(text,se_order):
#     prompt = []
#     for counter, se in enumerate(se_order):
#         prompt.append(data_utils.SENTIMENT_ELEMENT[se] + " : " + mask.replace('X',str(counter)))
#     prompt = " ,".join(prompt)
#     result = text + "| " + prompt
#     return result
def construct_prompt(text,se_order):
    """
    ### DESC
        Build the model input: the escaped text followed by " | ( name1 , name2 )".
    ### PARAMS
    * text: Raw review text.
    * se_order: Iterable of element letters naming the requested elements, in order.
    ### RETURN
    * Prompt string.
    """
    element_names = " , ".join(data_utils.SENTIMENT_ELEMENT[se] for se in se_order)
    # Replace the format characters in the text with their placeholders from added_tokens.
    masked_text = text
    for raw, placeholder in added_tokens.items():
        masked_text = masked_text.replace(raw, placeholder)
    return masked_text + " | " + "( " + element_names + " )"

In [None]:
# Demo: build the "oas" prompt for one laptop training example.
construct_prompt(peng_intermediate["lap14"]["oas"]["train"][4]["text"],"oas")

'One night I turned the freaking thing off after using it <comma> the next day I turn it on <comma> no GUI <comma> screen all dark <comma> power light steady <comma> hard drive light steady and not flashing as it usually does . | ( opinion , aspect , sentiment )'

# Answer Catch

In [None]:
import re

# def catch_answer(output,se_order):
#     output = output.replace("<pad>",'')
#     output = output.replace("</s>",'')
#     pattern = r""
#     for se in se_order:
#         if se != 's':
#             pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT[se]}>[^;]+)\s*"
#         else:
#             pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT['s']}>positive|negative|neutral)\s*"
#     found = [found_iter.groupdict() for found_iter in re.finditer(pattern,output)]
#     for i in range(len(found)):
#         for k, v in found[i].items():
#             found[i][k] = found[i][k].strip()
#     return found
def catch_answer(output,se_order):
    """
    ### DESC
        Parse an answer string of the form "( e1 , e2 ) ; ( e1 , e2 )" back into dictionaries.
    ### PARAMS
    * output: Text that contains the serialized tuples.
    * se_order: Iterable of element letters giving the order of elements inside a tuple.
    ### RETURN
    * found: List of dictionaries (element name -> stripped value), one per matched tuple.
    """
    element_patterns = []
    for se in se_order:
        name = data_utils.SENTIMENT_ELEMENT[se]
        if se == 's':
            # Sentiment is restricted to the three polarity labels.
            element_patterns.append(rf"\s*(?P<{name}>positive|negative|neutral)\s*")
        else:
            # Raw ',', '(', ')' and ';' never occur inside a serialized element (they are
            # written as placeholders), so excluding them keeps a match inside one tuple
            # instead of letting it start in the prompt and run into the first answer.
            element_patterns.append(rf"\s*(?P<{name}>[^;,()]+)\s*")
    # Raw strings keep the regex escapes from being read as string escapes.
    pattern = r"\(" + ','.join(element_patterns) + r"\)"
    return [
        {name: value.strip() for name, value in match.groupdict().items()}
        for match in re.finditer(pattern, output)
    ]

In [None]:
# Demo round trip: serialize one example's targets, then parse the string back into dictionaries.
output = construct_answer(peng_intermediate["lap14"]["oas"]["train"][4]["target"],"oas")
se_order = "oas"
catch_answer(output,se_order)

[{'opinion': 'no', 'aspect': 'GUI', 'sentiment': 'negative'},
 {'opinion': 'dark', 'aspect': 'screen', 'sentiment': 'negative'},
 {'opinion': 'steady', 'aspect': 'power light', 'sentiment': 'neutral'},
 {'opinion': 'steady', 'aspect': 'hard drive light', 'sentiment': 'negative'}]

In [None]:
# Show the serialized answer string that was parsed in the previous cell.
output

'( no , GUI , negative ) ; ( dark , screen , negative ) ; ( steady , power light , neutral ) ; ( steady , hard drive light , negative )'

# Data Preprocessing 2

In [None]:
from datasets import Dataset

def _to_record(el, task):
    """Turn one example into the {"input", "output", "task"} record fed to the model."""
    return {
        "input" : construct_prompt(el["text"],task),
        "output" : construct_answer(el["target"],task),
        "task" : task
    }


def _build_prompt_datasets(intermediate, complex_task):
    """
    ### DESC
        Build the prompt/answer datasets of one corpus.
    ### PARAMS
    * intermediate: Mapping domain -> task -> split -> list of examples with "text" and "target".
    * complex_task: Key of task_tree; "train" mixes every task listed under it,
      while "val" and "test" use the complex task only.
    ### RETURN
    * Mapping domain -> {"train", "val", "test"} -> Dataset.
    """
    result = dict()
    for domain, per_task in intermediate.items():
        # TRAIN: every task of the tree, in task_tree order
        train_records = [
            _to_record(el, basic_task)
            for basic_task in task_tree[complex_task]
            for el in per_task[basic_task]["train"]
        ]
        # VAL / TEST: complex task only
        val_records = [_to_record(el, complex_task) for el in per_task[complex_task]["val"]]
        test_records = [_to_record(el, complex_task) for el in per_task[complex_task]["test"]]
        result[domain] = {
            "train" : Dataset.from_list(train_records),
            "val" : Dataset.from_list(val_records),
            "test" : Dataset.from_list(test_records)
        }
    return result


peng_2 = _build_prompt_datasets(peng_intermediate, "oas")
wan_2 = _build_prompt_datasets(wan_intermediate, "asc")
zhang_2 = _build_prompt_datasets(zhang_intermediate, "oasc")
william_2 = _build_prompt_datasets(william_intermediate, "oas")

In [None]:
# Inspect one training record of the Indonesian hotel corpus.
william_2["hotel"]["train"][69]

{'input': 'tempat yag bagus dan nyaman untuk istirahat tetapi tolong tvnya perlu di perbaiki channelnya karena banyak semutnya digambar dan water heaternya tidak bisa jadi mandi air dingin terus . | ( opinion , aspect , sentiment )',
 'output': '( bagus , tempat , positive ) ; ( nyaman , tempat , positive ) ; ( perlu di perbaiki , tvnya , positive ) ; ( tidak bisa , water heaternya , negative )',
 'task': 'oas'}

# Prepare Tokenized Dataset

## English

In [None]:
# Tokenizer for the English corpora, loaded from the "gpt2-large" checkpoint with padding on the left.
tokenizer_en = AutoTokenizer.from_pretrained("gpt2-large", padding_side="left")

Downloading: 100%|██████████| 560/560 [00:00<00:00, 326kB/s]
Downloading: 100%|██████████| 1.01k/1.01k [00:00<00:00, 1.10MB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 3.50MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 2.02MB/s]
Downloading: 100%|██████████| 357/357 [00:00<00:00, 262kB/s]


In [None]:
# tokenizer_en.add_tokens(list(added_tokens.values()))

In [None]:
# Keyword arguments passed to every tokenizer call in this notebook (English and Indonesian).
encoding_args = {
    "max_length" : 512,
    "padding" : True,
    "truncation" : True,
    "return_tensors" : "pt"
}

In [None]:
# Make sure the English tokenizer has pad and sep tokens. resize_en records whether a
# token was added, so the model's embedding matrix can be resized to match later on.
resize_en = False
if tokenizer_en.pad_token is None:  # identity check is the idiomatic None test
    pad_token = "<|pad|>"
    tokenizer_en.add_tokens([pad_token])
    tokenizer_en.add_special_tokens({"pad_token": pad_token})
    resize_en = True

if tokenizer_en.sep_token is None:
    sep_token = "<|sep|>"
    tokenizer_en.add_tokens([sep_token])
    tokenizer_en.add_special_tokens({"sep_token": sep_token})
    resize_en = True

Using pad_token, but it is not set yet.
Using sep_token, but it is not set yet.


In [None]:
def encode_en(dataset):
    """
    ### DESC
        Tokenize a batch for causal-LM training as "<input> <sep> <output> <eos>".
    ### PARAMS
    * dataset: Batch with parallel "input" and "output" string columns.
    ### RETURN
    * Encoding returned by tokenizer_en for the joined texts.
    """
    joined_texts = [
        ' '.join((prompt, tokenizer_en.sep_token, answer, tokenizer_en.eos_token))
        for prompt, answer in zip(dataset["input"],dataset["output"])
    ]
    return tokenizer_en(joined_texts, **encoding_args)

In [None]:
# Tokenize the Peng corpora. Train/val rows are encoded together with their answer through
# encode_en; test rows are encoded as prompt + separator only.
peng_tok = {}
for domain, v1 in peng_2.items():
    peng_tok[domain] = {}
    for split, v2 in v1.items():
        if split == "test":
            test_input = [row_input + ' ' + tokenizer_en.sep_token for row_input in v2["input"]]
            peng_tok[domain][split] = tokenizer_en(test_input, **encoding_args)
        else:
            peng_tok[domain][split] = v2.map(encode_en,batched=True,remove_columns=["input","output","task"])

                                                                  

In [None]:
# Tokenize the Wan corpora. Train/val rows are encoded together with their answer through
# encode_en; test rows are encoded as prompt + separator only.
wan_tok = {}
for domain, v1 in wan_2.items():
    wan_tok[domain] = {}
    for split, v2 in v1.items():
        if split == "test":
            test_input = [row_input + ' ' + tokenizer_en.sep_token for row_input in v2["input"]]
            wan_tok[domain][split] = tokenizer_en(test_input, **encoding_args)
        else:
            wan_tok[domain][split] = v2.map(encode_en,batched=True,remove_columns=["input","output","task"])

                                                                  

In [None]:
# Tokenize the Zhang corpora. Train/val rows are encoded together with their answer through
# encode_en; test rows are encoded as prompt + separator only.
zhang_tok = {}
for domain, v1 in zhang_2.items():
    zhang_tok[domain] = {}
    for split, v2 in v1.items():
        if split == "test":
            test_input = [row_input + ' ' + tokenizer_en.sep_token for row_input in v2["input"]]
            zhang_tok[domain][split] = tokenizer_en(test_input, **encoding_args)
        else:
            zhang_tok[domain][split] = v2.map(encode_en,batched=True,remove_columns=["input","output","task"])

                                                                  

## Indo

In [None]:
# Tokenizer for the Indonesian corpus, loaded from the "facebook/xglm-564M" checkpoint with padding on the left.
tokenizer_id = AutoTokenizer.from_pretrained("facebook/xglm-564M", padding_side="left")

In [None]:
# tokenizer_id.add_tokens(list(added_tokens.values()))

In [None]:
# Make sure the Indonesian tokenizer has pad and sep tokens. resize_id records whether a
# token was added, so a model using this tokenizer can have its embeddings resized to match.
resize_id = False
if tokenizer_id.pad_token is None:  # identity check is the idiomatic None test
    pad_token = "<|pad|>"
    tokenizer_id.add_tokens([pad_token])
    tokenizer_id.add_special_tokens({"pad_token": pad_token})
    resize_id = True

if tokenizer_id.sep_token is None:
    sep_token = "<|sep|>"
    tokenizer_id.add_tokens([sep_token])
    tokenizer_id.add_special_tokens({"sep_token": sep_token})
    resize_id = True

In [None]:
def encode_id(dataset):
    """
    ### DESC
        Tokenize a batch for causal-LM training as "<input> <sep> <output> <eos>",
        using the Indonesian tokenizer for both the special tokens and the encoding.
    ### PARAMS
    * dataset: Batch with parallel "input" and "output" string columns.
    ### RETURN
    * Encoding returned by tokenizer_id for the joined texts.
    """
    # The separator and end-of-sequence strings must come from tokenizer_id itself: the
    # test prompts are built with tokenizer_id.sep_token, and the English tokenizer's
    # special-token strings are not necessarily special tokens of this tokenizer.
    causal_lm_input =[row_input + ' ' + tokenizer_id.sep_token + ' ' + row_output + ' ' + tokenizer_id.eos_token
                      for row_input, row_output in zip(dataset["input"],dataset["output"])]
    result = tokenizer_id(causal_lm_input, **encoding_args)
    return result

In [None]:
# Tokenize the William corpus. Train/val rows are encoded together with their answer through
# encode_id; test rows are encoded as prompt + separator only.
william_tok = {}
for domain, v1 in william_2.items():
    william_tok[domain] = {}
    for split, v2 in v1.items():
        if split == "test":
            test_input = [row_input + ' ' + tokenizer_id.sep_token for row_input in v2["input"]]
            william_tok[domain][split] = tokenizer_id(test_input, **encoding_args)
        else:
            william_tok[domain][split] = v2.map(encode_id,batched=True,remove_columns=["input","output","task"])

                                                                   

# Data Collator

## English

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for the English tokenizer; mlm=False turns masked-language-model masking off.
data_collator_en = DataCollatorForLanguageModeling(tokenizer=tokenizer_en,mlm=False)

## Indo

In [None]:
# Batch collator for the Indonesian tokenizer; mlm=False turns masked-language-model masking off.
data_collator_id = DataCollatorForLanguageModeling(tokenizer=tokenizer_id,mlm=False)

# Compute Metrics

In [None]:
from transformers import EvalPrediction
from evaluation import recall, precision, f1_score, summary_score
from typing import List, Dict, Tuple
import numpy as np

def seperate_target_prediction_per_task(predictions:List[List[Dict]],targets:List[List[Dict]],tasks:List) -> Tuple[Dict[str,List],Dict[str,List]]:
    """
    ### DESC
        Group row-aligned targets and predictions by the task of each row.
    ### PARAMS
    * predictions: Parsed predictions, one list of tuples per row.
    * targets: Parsed targets, one list of tuples per row.
    * tasks: Task name of every row.
    ### RETURN
    * (per_task_targets, per_task_predictions): Two dictionaries task -> list of rows,
      in that order (targets first).
    """
    grouped_targets: Dict[str, List] = {}
    grouped_predictions: Dict[str, List] = {}
    for gold, predicted, task_name in zip(targets, predictions, tasks):
        grouped_targets.setdefault(task_name, []).append(gold)
        grouped_predictions.setdefault(task_name, []).append(predicted)
    return grouped_targets, grouped_predictions

def preprocess_eval_preds(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer):
    """
    ### DESC
        Decode the inputs, labels and predictions of an evaluation pass into text.
    ### PARAMS
    * eval_preds: Object exposing `inputs`, `label_ids` and `predictions`.
    * decoding_args: Keyword arguments forwarded to tokenizer.batch_decode.
    * tokenizer: Tokenizer used for decoding.
    ### RETURN
    * (inputs, targets, predictions): Three lists of decoded strings.
    """
    def _as_token_rows(ids):
        # A tuple means extra outputs came along; the ids are its first entry.
        if isinstance(ids, tuple):
            ids = ids[0]
        # A 3-D array holds scores over the vocabulary; reduce it to ids with argmax.
        if len(ids.shape) == 3:
            ids = np.argmax(ids,axis=-1)
        # -100 marks ignored positions and cannot be decoded.
        return [[token for token in row if token != -100] for row in ids]

    input_rows = _as_token_rows(eval_preds.inputs)
    target_rows = _as_token_rows(eval_preds.label_ids)
    prediction_rows = _as_token_rows(eval_preds.predictions)

    return (
        tokenizer.batch_decode(input_rows,**decoding_args),
        tokenizer.batch_decode(target_rows,**decoding_args),
        tokenizer.batch_decode(prediction_rows,**decoding_args),
    )

def compute_metrics(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer,tasks:List) -> Dict[str,float]: # may not be sufficient for causal LMs
    """
    ### DESC
        Method to compute the metrics.
    ### PARAMS
    * eval_preds: EvalPrediction instance from training.
    * decoding_args: Decoding arguments.
    * tokenizer: Tokenizer used to decode the ids; its sep_token (if any) marks where the answer starts.
    * tasks: Task name of every evaluated row; rows tagged "non_absa" are ignored.
    ### RETURN
    * metrics: Dictionary of metrics (overall and per task recall / precision / F1).
    """
    _inputs, targets, predictions = preprocess_eval_preds(eval_preds,decoding_args,tokenizer)

    sep_token = getattr(tokenizer, "sep_token", None)

    def _answer_part(text):
        # Sequences are laid out as "<prompt> <sep> <answer>"; only the answer holds tuples.
        # The prompt itself contains a tuple-shaped "( element names )" that must not be parsed.
        return text.split(sep_token)[-1] if sep_token else text

    # Filter targets, predictions AND tasks together so the three lists stay row-aligned.
    absa_rows = [(target, prediction, task)
                 for target, prediction, task in zip(targets,predictions,tasks)
                 if task != "non_absa"]
    absa_tasks = [task for _, _, task in absa_rows]
    targets = [catch_answer(_answer_part(target),task) for target, _, task in absa_rows]
    predictions = [catch_answer(_answer_part(prediction),task) for _, prediction, task in absa_rows]

    per_task_targets, per_task_predictions = seperate_target_prediction_per_task(predictions, targets, absa_tasks)

    metrics = {}

    metrics["overall_recall"] = recall(predictions,targets)
    metrics["overall_precision"] = precision(predictions,targets)
    metrics["overall_f1_score"] = f1_score(predictions,targets)

    for task in per_task_targets.keys():
        metrics[f"{task}_recall"] = recall(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_precision"] = precision(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_f1_score"] = f1_score(per_task_predictions[task],per_task_targets[task])

    return metrics

# Train Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. A total batch of 16 is shared across the visible GPUs; the
# max(...) guards keep the per-device batch size at 1 or more, so a CPU-only run
# (n_gpu == 0) does not fail with ZeroDivisionError and more than 16 GPUs do not yield 0.
train_args = {
    "num_train_epochs": 10,
    "learning_rate": 3e-4,
    "save_total_limit": 2,
    "gradient_accumulation_steps": 2,
    "per_device_train_batch_size": max(1, 16 // max(1, n_gpu)),
    "per_device_eval_batch_size": max(1, 16 // max(1, n_gpu)),
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch",
    "logging_strategy" : "epoch",
    # "metric_for_best_model": "overall_f1_score",
    # "load_best_model_at_end": True,
    "adam_epsilon": 1e-08,
    "output_dir": "./output",
    "logging_dir" : "./output/log",
    # compute_metrics reads eval_preds.inputs, so the inputs must be passed along.
    "include_inputs_for_metrics" : True
}

train_args = Seq2SeqTrainingArguments(**train_args)

# Train

In [None]:
import torch
# Use the first visible CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [None]:
from transformers import Seq2SeqTrainer

# trainer = {
#     "peng" : {},
#     "wan" : {},
#     "zhang" : {},
#     "william" : {}
# }

# Keyword arguments for tokenizer.batch_decode. Special tokens are kept in the decoded text;
# generate_predictions splits that text on the separator token.
decoding_args = {
    "skip_special_tokens" : False
}

def preprocess_logits_for_metrics(logits, targets):
    """
    ### DESC
        Reduce logits to predicted token ids before they are handed to the metric code.
    ### PARAMS
    * logits: Logit tensor, or a tuple whose first entry is the logit tensor.
    * targets: Labels, passed through unchanged.
    ### RETURN
    * (pred_ids, targets): Argmax over the last dimension, and the untouched labels.
    """
    if isinstance(logits,tuple):
        logits = logits[0]
    return logits.argmax(dim=-1), targets

In [None]:
from tqdm import tqdm

def generate_predictions(model,tokenizer,tokenized:torch.Tensor,device:torch.device=torch.device("cpu"),batch_size:int=16,max_len:int=512,decoding_args:Dict={}) -> List[str]:
    """
    ### DESC
        Generate answers for already tokenized prompts, batch by batch.
    ### PARAMS
    * model: Model exposing `generate`; it is switched to evaluation mode.
    * tokenizer: Tokenizer providing pad/eos ids, `batch_decode` and `sep_token`.
    * tokenized: Mapping with "input_ids" and "attention_mask" tensors of equal length.
    * device: Device the batches are moved to before generation.
    * batch_size: Number of prompts per generate call.
    * max_len: `max_length` passed to generate.
    * decoding_args: Keyword arguments forwarded to tokenizer.batch_decode (never modified here).
    ### RETURN
    * predictions: For every prompt, the decoded text after the last separator token.
    """
    # Data loader (same batch size and no shuffling, so both loaders stay row-aligned)
    input_ids_data_loader = torch.utils.data.DataLoader(tokenized["input_ids"],
                        batch_size=batch_size,shuffle=False)
    attention_mask_data_loader = torch.utils.data.DataLoader(tokenized["attention_mask"],
                        batch_size=batch_size,shuffle=False)
    # Predict
    model.eval()  # disable dropout etc.; the model may still be in training mode
    generated_rows = []
    with torch.no_grad():
        batches = zip(input_ids_data_loader,attention_mask_data_loader)
        # total= lets the progress bar show how many batches remain
        for input_ids, attention_mask in tqdm(batches,total=len(input_ids_data_loader)):
            generated = model.generate(input_ids=input_ids.to(device),
                                       attention_mask=attention_mask.to(device),
                                       max_length=max_len,
                                       pad_token_id=tokenizer.pad_token_id,
                                       eos_token_id=tokenizer.eos_token_id)
            generated_rows.extend(generated.cpu())
    # Drop ignore-index entries and hand plain Python ints to the tokenizer.
    token_rows = [row[row != -100].tolist() for row in generated_rows]
    predictions = tokenizer.batch_decode(token_rows,**decoding_args)
    # The generated sequence repeats the prompt; keep only what follows the separator.
    return [el.split(tokenizer.sep_token)[-1] for el in predictions]

In [None]:
import json

def save_result(str_preds_,preds,targets,filename):
    """Bundle predictions with their targets, write them as JSON and return them.

    Args:
        str_preds_: Raw generated strings; every "<pad>" occurrence is removed.
        preds: Parsed predictions, one per generated string.
        targets: Parsed gold answers, one per generated string.
        filename: Path of the JSON file to write (overwritten if present).

    Returns:
        List of dicts with keys "str_pred", "pred" and "target", one per example.

    Raises:
        AssertionError: If the three inputs do not have the same length.
    """
    cleaned = [text.replace("<pad>",'') for text in str_preds_]
    assert len(cleaned) == len(preds) == len(targets)

    # One record per example, in input order.
    records = [
        {"str_pred" : text, "pred" : pred, "target" : target}
        for text, pred, target in zip(cleaned, preds, targets)
    ]

    with open(filename,'w') as fp:
        json.dump(records,fp)
    return records

# Peng Laptop 2014

In [None]:
# Fine-tune a fresh "gpt2-large" causal LM on the Peng Laptop 2014 split.
# NOTE(review): the recorded output for this cell reports 125,200,128 trainable
# parameters, which looks too small for gpt2-large — the output may be stale;
# re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["lap14"]["train"],
        eval_dataset = peng_tok["lap14"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["lap14"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

***** Running training *****
  Num examples = 4530
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 1420
  Number of trainable parameters = 125200128
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 219
  Batch size = 16
  Num examples = 219
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-142
Configuration saved in ./output/checkpoint-142/config.json
Model weights saved in ./output/checkpoint-142/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-142/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-142/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 219
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-284
Configuration saved in ./output/checkpoint-284/config.json
Model weights saved in ./output/checkpoint-284/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-284/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-284/special_tokens_map.json
Saving model checkpoint to ./output/checkpoint-426
Configuration saved in ./output/checkpoint-426/config.json
Model weights saved in ./output/checkpoint-426/pytorc

TrainOutput(global_step=1420, training_loss=6.237962491747359, metrics={'train_runtime': 578.5929, 'train_samples_per_second': 78.293, 'train_steps_per_second': 2.454, 'total_flos': 3346180256102400.0, 'train_loss': 6.237962491747359, 'epoch': 10.0})

In [None]:
# Generate answers for the Peng Laptop 2014 test split (batch size 32,
# max length 512) and parse each string with `catch_answer`.
# NOTE(review): the Peng files are read with target_format="aos" at the top of
# the notebook while parsing here uses "oas" — confirm the intended order.
str_preds = generate_predictions(model, tokenizer_en, peng_tok["lap14"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

11it [00:02,  3.69it/s]


In [None]:
targets = [catch_answer(el,"oas") for el in peng_2["lap14"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0, 'f1_score': 0}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Drop the notebook-level reference to the model and release cached CUDA memory.
# NOTE(review): `trainer` was built with this model and presumably still holds
# a reference, so the weights may not be freed until `trainer` is rebound —
# confirm. The other dataset sections have no equivalent cleanup cell.
del model
torch.cuda.empty_cache()

In [None]:
result = save_result(str_preds, preds, targets, "peng_lap14.json")

# Peng Restaurant 2014

In [53]:
# Fine-tune a fresh "gpt2-large" causal LM on the Peng Restaurant 2014 split.
# NOTE(review): the recorded output for this cell logs an
# EleutherAI/gpt-neo-125m config rather than gpt2-large — the output looks
# stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["res14"]["train"],
        eval_dataset = peng_tok["res14"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["res14"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--EleutherAI--gpt-neo-125m/snapshots/b983397156c0991016feccfbcbe1fe2746d47b29/config.json
Model config GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "resid_dropout"

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=1980, training_loss=20.77789368292298, metrics={'train_runtime': 926.9541, 'train_samples_per_second': 68.288, 'train_steps_per_second': 2.136, 'total_flos': 5437342036684800.0, 'train_loss': 20.77789368292298, 'epoch': 10.0})

In [54]:
# Generate answers for the Peng Restaurant 2014 test split (batch size 32,
# max length 512) and parse each string with `catch_answer`.
# NOTE(review): the Peng files are read with target_format="aos" at the top of
# the notebook while parsing here uses "oas" — confirm the intended order.
str_preds = generate_predictions(model, tokenizer_en, peng_tok["res14"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

16it [01:37,  6.12s/it]


In [55]:
targets = [catch_answer(el,"oas") for el in peng_2["res14"]["test"]["output"]]

In [56]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0, 'f1_score': 0}

In [57]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [58]:
result = save_result(str_preds, preds, targets, "peng_res14.json")

# Peng Restaurant 2015

In [59]:
# Fine-tune a fresh "gpt2-large" causal LM on the Peng Restaurant 2015 split.
# NOTE(review): the recorded output for this cell logs an
# EleutherAI/gpt-neo-125m config rather than gpt2-large — the output looks
# stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["res15"]["train"],
        eval_dataset = peng_tok["res15"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["res15"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--EleutherAI--gpt-neo-125m/snapshots/b983397156c0991016feccfbcbe1fe2746d47b29/config.json
Model config GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "resid_dropout"

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=950, training_loss=6.7153996196546055, metrics={'train_runtime': 441.5222, 'train_samples_per_second': 68.513, 'train_steps_per_second': 2.152, 'total_flos': 2592211071320064.0, 'train_loss': 6.7153996196546055, 'epoch': 10.0})

In [60]:
# Generate answers for the Peng Restaurant 2015 test split (batch size 32,
# max length 512) and parse each string with `catch_answer`.
# NOTE(review): the Peng files are read with target_format="aos" at the top of
# the notebook while parsing here uses "oas" — confirm the intended order.
str_preds = generate_predictions(model, tokenizer_en, peng_tok["res15"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

11it [01:03,  5.77s/it]


In [61]:
targets = [catch_answer(el,"oas") for el in peng_2["res15"]["test"]["output"]]

In [62]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0, 'f1_score': 0}

In [63]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [64]:
result = save_result(str_preds, preds, targets, "peng_res15.json")

# Peng Restaurant 2016

In [65]:
# Fine-tune a fresh "gpt2-large" causal LM on the Peng Restaurant 2016 split.
# NOTE(review): the recorded output for this cell logs an
# EleutherAI/gpt-neo-125m config rather than gpt2-large — the output looks
# stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = peng_tok["res16"]["train"],
        eval_dataset = peng_tok["res16"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,peng_2["res16"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--EleutherAI--gpt-neo-125m/snapshots/b983397156c0991016feccfbcbe1fe2746d47b29/config.json
Model config GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "resid_dropout"

Epoch,Training Loss,Validation Loss


In [None]:
# Generate answers for the Peng Restaurant 2016 test split (batch size 32,
# max length 512) and parse each string with `catch_answer`.
# NOTE(review): the Peng files are read with target_format="aos" at the top of
# the notebook while parsing here uses "oas" — confirm the intended order.
str_preds = generate_predictions(model, tokenizer_en, peng_tok["res16"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

11it [01:05,  5.92s/it]


In [None]:
targets = [catch_answer(el,"oas") for el in peng_2["res16"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.005836575875486381,
 'precision': 0.0182648401826484,
 'f1_score': 0.008846295613711757}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "peng_res16.json")

# Wan Restaurant 2015

In [None]:
# Fine-tune a fresh "gpt2-large" causal LM on the Wan Restaurant 2015 split.
# NOTE(review): the recorded output for this cell logs a plain "gpt2" config
# rather than gpt2-large — the output looks stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = wan_tok["res15"]["train"],
        eval_dataset = wan_tok["res15"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,wan_2["res15"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfo

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 10
  Batch size = 16
  Num examples = 10
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-175
Configuration saved in ./output/checkpoint-175/config.json
Saving model checkpoint to ./output/checkpoint-175
Configuration saved in ./output/checkpoint-175/config.json
Model weights saved in ./output/checkpoint-175/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-175/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-175/special_tokens_map.json
Saving model checkpoint to ./output/checkpoint-350
Configuration saved in ./output/checkpoint-350/config.json
Model weights saved in ./output/checkpoint-350/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-350/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-350/special_tokens_map.json
Saving model checkpoint to ./output/checkpoint-525
Configuration saved in ./output/checkpoint-525/config.json
Model weights s

TrainOutput(global_step=3500, training_loss=2.9796246425083703, metrics={'train_runtime': 1834.0091, 'train_samples_per_second': 61.068, 'train_steps_per_second': 1.908, 'total_flos': 1.0602748901376e+16, 'train_loss': 2.9796246425083703, 'epoch': 20.0})

In [None]:
# Generate answers for the Wan Restaurant 2015 test split (batch size 32,
# max length 512) and parse each string with `catch_answer` in "asc" format.
str_preds = generate_predictions(model, tokenizer_en, wan_tok["res15"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"asc") for el in str_preds]

19it [02:16,  7.20s/it]


In [None]:
targets = [catch_answer(el,"asc") for el in wan_2["res15"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.11952662721893491,
 'precision': 0.14647577092511013,
 'f1_score': 0.13163606787101959}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "wan_res15.json")

# Wan Restaurant 2016

In [None]:
# Fine-tune a fresh "gpt2-large" causal LM on the Wan Restaurant 2016 split.
# NOTE(review): the recorded output for this cell logs a plain "gpt2" config
# rather than gpt2-large — the output looks stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = wan_tok["res16"]["train"],
        eval_dataset = wan_tok["res16"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,wan_2["res16"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfo

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=5340, training_loss=4.955751294768258, metrics={'train_runtime': 2770.7722, 'train_samples_per_second': 61.643, 'train_steps_per_second': 1.927, 'total_flos': 1.6023935955456e+16, 'train_loss': 4.955751294768258, 'epoch': 20.0})

In [None]:
# Generate answers for the Wan Restaurant 2016 test split (batch size 32,
# max length 512) and parse each string with `catch_answer` in "asc" format.
str_preds = generate_predictions(model, tokenizer_en, wan_tok["res16"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"asc") for el in str_preds]

19it [01:47,  5.65s/it]


In [None]:
targets = [catch_answer(el,"asc") for el in wan_2["res16"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0, 'f1_score': 0}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "wan_res16.json")

# Zhang Restaurant 2015

In [None]:
# Fine-tune a fresh "gpt2-large" causal LM on the Zhang Restaurant 2015 split.
# NOTE(review): the recorded output for this cell logs a plain "gpt2" config
# rather than gpt2-large — the output looks stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = zhang_tok["res15"]["train"],
        eval_dataset = zhang_tok["res15"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,zhang_2["res15"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfo

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=3640, training_loss=3.3605101784506997, metrics={'train_runtime': 1986.8205, 'train_samples_per_second': 58.767, 'train_steps_per_second': 1.832, 'total_flos': 1.1273477382144e+16, 'train_loss': 3.3605101784506997, 'epoch': 20.0})

In [None]:
# Generate answers for the Zhang Restaurant 2015 test split (batch size 32,
# max length 512) and parse each string with `catch_answer` in "oasc" format.
str_preds = generate_predictions(model, tokenizer_en, zhang_tok["res15"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oasc") for el in str_preds]

17it [02:11,  7.73s/it]


In [None]:
targets = [catch_answer(el,"oasc") for el in zhang_2["res15"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0.0, 'f1_score': 0}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "zhang_res15.json")

# Zhang Restaurant 2016

In [None]:
# Fine-tune a fresh "gpt2-large" causal LM on the Zhang Restaurant 2016 split.
# NOTE(review): the recorded output for this cell logs a plain "gpt2" config
# rather than gpt2-large — the output looks stale; re-run to confirm.
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
# `resize_en` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_en:
    model.resize_token_embeddings(len(tokenizer_en))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_en,
        data_collator = data_collator_en,
        train_dataset = zhang_tok["res16"]["train"],
        eval_dataset = zhang_tok["res16"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_en,zhang_2["res16"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfo

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 316
  Batch size = 16
  Num examples = 316
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-276
Configuration saved in ./output/checkpoint-276/config.json
Model weights saved in ./output/checkpoint-276/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-276/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-276/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 316
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-552
Configuration saved in ./output/checkpoint-552/config.json
Model weights saved in ./output/checkpoint-552/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-552/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-552/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 316
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-828
Configuration saved in ./output/checkpoint

TrainOutput(global_step=5520, training_loss=4.20832051816194, metrics={'train_runtime': 3280.4895, 'train_samples_per_second': 53.943, 'train_steps_per_second': 1.683, 'total_flos': 1.910844960768e+16, 'train_loss': 4.20832051816194, 'epoch': 20.0})

In [None]:
# Generate answers for the Zhang Restaurant 2016 test split (batch size 32,
# max length 512) and parse each string with `catch_answer` in "oasc" format.
str_preds = generate_predictions(model, tokenizer_en, zhang_tok["res16"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oasc") for el in str_preds]

17it [00:30,  1.79s/it]


In [None]:
targets = [catch_answer(el,"oasc") for el in zhang_2["res16"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0.0, 'f1_score': 0}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "zhang_res16.json")

# William Hotel

In [47]:
from transformers import GPT2LMHeadModel

In [49]:
# Fine-tune a fresh "facebook/xglm-564M" causal LM on the William hotel split,
# using the `_id` tokenizer and resize flag.
# NOTE(review): the recorded output for this cell ends in a CUDA index
# assertion followed by a RuntimeError, which would be consistent with token
# ids outside the embedding table — confirm after re-running.
model = AutoModelForCausalLM.from_pretrained("facebook/xglm-564M")
# `resize_id` is defined earlier in the notebook; when set, the embedding table
# is resized to the tokenizer's length.
if resize_id:
    model.resize_token_embeddings(len(tokenizer_id))
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_id,
        # NOTE(review): this pairs tokenizer_id with data_collator_en; if a
        # collator built for tokenizer_id exists it was probably intended — confirm.
        data_collator = data_collator_en,
        train_dataset = william_tok["hotel"]["train"],
        eval_dataset = william_tok["hotel"]["val"],
        # Metrics receive the validation "task" column alongside the predictions.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_id,william_2["hotel"]["val"]["task"]),
        # Reduce logits to arg-max ids before they reach compute_metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

***** Running training *****
  Num examples = 15000
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 4690
  Number of trainable parameters = 125198592
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [63,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [63,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.c

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# Generate answers for the William hotel test split (batch size 32,
# max length 512) with the `_id` tokenizer and parse each string with
# `catch_answer` in "oas" format.
str_preds = generate_predictions(model, tokenizer_id, william_tok["hotel"]["test"], device, 32, 512, decoding_args)
preds = [catch_answer(el,"oas") for el in str_preds]

32it [08:06, 15.19s/it]


In [None]:
targets = [catch_answer(el,"oas") for el in william_2["hotel"]["test"]["output"]]

In [None]:
summary_score(preds,targets)

{'recall': 0.0, 'precision': 0.0, 'f1_score': 0}

In [None]:
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
result = save_result(str_preds, preds, targets, "william_hotel.json")