In [1]:
import os
import torch

# Expose only the CUDA device with index 5 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
# Clamp to at least 1: n_gpu is used as a divisor for the per-device batch
# size (16 // n_gpu), which would raise ZeroDivisionError when no CUDA device
# is visible (device_count() == 0, e.g. on a CPU-only machine).
n_gpu = max(1, torch.cuda.device_count())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Task identifiers grouped by how many sentiment elements they cover.
# Every character of an identifier is later used as a key into
# data_utils.SENTIMENT_ELEMENT when prompts and answers are built.
tasks = dict(
    single=list("aco"),
    simple=["oa", "as", "sc"],
    complex=["oasc"],
)

In [3]:
# Task mixes to train on, one experiment per entry. Each entry is the list of
# task identifiers whose training splits are pooled together.
# Note: the first two entries are the very same list objects stored in `tasks`
# (no copy), while the concatenated entries are new lists.
combination_tasks = [
    tasks["simple"],
    tasks["complex"],
    tasks["single"] + tasks["simple"],
    tasks["single"] + tasks["complex"],
    tasks["simple"] + tasks["complex"],
    tasks["single"] + tasks["simple"] + tasks["complex"]
]

In [4]:
# Display the task mixes as the cell output for a quick visual check.
combination_tasks

[['oa', 'as', 'sc'],
 ['oasc'],
 ['a', 'c', 'o', 'oa', 'as', 'sc'],
 ['a', 'c', 'o', 'oasc'],
 ['oa', 'as', 'sc', 'oasc'],
 ['a', 'c', 'o', 'oa', 'as', 'sc', 'oasc']]

In [5]:
# The last combination is single + simple + complex, i.e. every task identifier.
all_task = combination_tasks[-1]
print(all_task)

['a', 'c', 'o', 'oa', 'as', 'sc', 'oasc']


In [6]:
import sys
sys.path.append("../../../src/")
import data_utils

# Location of the interim data files for each restaurant domain,
# keyed by the short domain name used throughout the notebook.
directory = {
    f"res{year}": f"../../../data/absa/en/zhang/interim/interim_2/rest{year}"
    for year in ("15", "16")
}

In [7]:
# Load every split of every domain with the "acso" target format.
# Resulting layout: data[domain][split], with domain in {"res15", "res16"}
# and split in {"train", "val", "test"}. The "val" split is read from dev.txt.
data = {
    domain: {
        split: data_utils.read_data(path=directory[domain] + "/" + stem + ".txt",
                                    target_format="acso")
        for split, stem in (("train", "train"), ("val", "dev"), ("test", "test"))
    }
    for domain in ("res15", "res16")
}

In [8]:
from copy import deepcopy

# Build one task-specific view of every dataset:
#   data_intermediate[domain][task][split] -> list of {"text", "target"} items.
# The raw data is deep-copied first so that `data` itself is never modified.
data_intermediate = {}

for domain, splits in data.items():
    per_task = {}
    for task in all_task:
        per_split = {}
        for split, original in splits.items():
            ds_copy = deepcopy(original)
            for example in ds_copy:
                # Reduce the targets to the given task (presumably keeps only the
                # sentiment elements named in `task`; see data_utils.reduce_targets).
                reduced = data_utils.reduce_targets(example["target"],task)
                # Remove duplicate targets left after the reduction.
                example["target"] = data_utils.remove_duplicate_targets(reduced)
            per_split[split] = ds_copy
        per_task[task] = per_split
    data_intermediate[domain] = per_task

In [9]:
# Display the task-specific datasets as the cell output.
# NOTE(review): this renders the entire nested structure; consider showing a
# small slice instead to keep the notebook output readable.
data_intermediate

{'res15': {'a': {'train': [{'text': "The wait here is long for dim sum , but if you do n ' t like sharing tables or if the typical raucous dim sum atmosphere is not your gig , this is a sleek ( for Chinatown ) alternative .",
     'target': [{'aspect': 'wait'},
      {'aspect': 'atmosphere'},
      {'aspect': 'NULL'}]},
    {'text': "Just because it ' s cheap does NOT mean the portions are small or the food is nasty , IT IS GREAT !",
     'target': [{'aspect': 'food'}, {'aspect': 'NULL'}]},
    {'text': 'Food is excellent .', 'target': [{'aspect': 'Food'}]},
    {'text': 'As always we had a great glass of wine while we waited .',
     'target': [{'aspect': 'glass of wine'}]},
    {'text': 'I can not imagine a friendlier staff working in a restaurant .',
     'target': [{'aspect': 'staff'}]},
    {'text': "Also , specify if you like your food spicy - its rather bland if you do n ' t .",
     'target': [{'aspect': 'food'}]},
    {'text': 'Big Wong gets big Ups for a fine establishment .'

In [10]:
# Template for the sentinel tokens used in prompts and answers: the literal 'X'
# is replaced by a running index (e.g. "<extra_id_0>") via mask.replace('X', ...).
mask = "<extra_id_X>"

In [11]:
# def construct_answer(targets,se_order):
#     result = []
#     for t in targets:
#         constructed_t = []
#         for se in se_order:
#             element = t[data_utils.SENTIMENT_ELEMENT[se]]
#             for k, v in added_tokens.items():
#                 element = element.replace(k,v)
#             constructed_t.append(element)
#         constructed_t = " , ".join(constructed_t)
#         constructed_t = f"( {constructed_t} )"
#         result.append(constructed_t)
#     result = " ; ".join(result)
#     return result

def construct_answer(targets,se_order):
    """
    Serialise a list of target tuples into the answer string the model is trained to emit.

    Each sentiment element of each target is written as "<extra_id_N> value",
    following the order given by `se_order`. Elements of one target are joined
    with a single space and targets are joined with " ; ".

    * targets: iterable of dicts keyed by data_utils.SENTIMENT_ELEMENT values.
    * se_order: iterable of sentiment-element codes (keys of data_utils.SENTIMENT_ELEMENT).

    Returns the answer as a single string ("" when `targets` is empty).
    """
    sentinel_idx = 0  # runs across ALL targets, not per target
    serialised = []
    for target in targets:
        chunks = []
        for se in se_order:
            # Wrap around after 100 sentinels so the index stays within 0..99.
            sentinel_idx %= 100
            chunks.append(mask.replace('X',str(sentinel_idx)) + ' ' + target[data_utils.SENTIMENT_ELEMENT[se]])
            sentinel_idx += 1
        serialised.append(' '.join(chunks).strip())
    return " ; ".join(serialised)

In [12]:
# def construct_prompt(text,se_order):
#     prompt = []
#     for se in se_order:
#         prompt.append(data_utils.SENTIMENT_ELEMENT[se])
#     prompt = " , ".join(prompt)
#     prompt = f"( {prompt} )"
#     masked_text = text
#     for k, v in added_tokens.items():
#         masked_text = masked_text.replace(k,v)
#     result = masked_text + " | " + prompt
#     return result

def construct_prompt(text,se_order):
    """
    Build the model input: the raw text followed by one "<element name> : <extra_id_N>"
    slot per sentiment element in `se_order`.

    * text: the review sentence.
    * se_order: iterable of sentiment-element codes (keys of data_utils.SENTIMENT_ELEMENT).

    Returns the prompt string.
    """
    slots = [
        data_utils.SENTIMENT_ELEMENT[se] + " : " + mask.replace('X',str(slot_idx))
        for slot_idx, se in enumerate(se_order)
    ]
    # NOTE(review): the separators are " ," between slots and "| " after the text
    # (no space before the pipe) - confirm this spacing is intentional.
    return text + "| " + " ,".join(slots)

In [13]:
import re

# def catch_answer(output,se_order):
#     output = output.replace("<pad>",'')
#     output = output.replace("</s>",'')
#     pattern = []
#     for se in se_order:
#         if se != 's':
#             pattern.append(f"\s*(?P<{data_utils.SENTIMENT_ELEMENT[se]}>[^;]+)\s*")
#         else:
#             pattern.append(f"\s*(?P<{data_utils.SENTIMENT_ELEMENT['s']}>positive|negative|neutral)\s*")
#     pattern = ','.join(pattern)
#     pattern = f"\({pattern}\)"
#     found = [found_iter.groupdict() for found_iter in re.finditer(pattern,output)]
#     for i in range(len(found)):
#         for k, v in found[i].items():
#             found[i][k] = found[i][k].strip()
#     return found

def catch_answer(output,se_order):
    """
    Parse a generated (or gold) answer string back into a list of target dicts.

    The "<pad>" and "</s>" markers are removed first. A regular expression is
    then built with one "<extra_id_N> value" segment per element of `se_order`;
    the sentiment element ('s') only accepts positive/negative/neutral, every
    other element accepts any run of characters that does not contain ';'.

    * output: decoded model output or reference answer.
    * se_order: iterable of sentiment-element codes (keys of data_utils.SENTIMENT_ELEMENT).

    Returns a list with one dict per match, keyed by the element names from
    data_utils.SENTIMENT_ELEMENT, with surrounding whitespace stripped.
    """
    cleaned = output.replace("<pad>",'').replace("</s>",'')

    # Raw strings are required here: "\d" / "\s" inside a normal (f-)string are
    # invalid escape sequences and trigger a warning on recent Python versions.
    sentinel = r"<extra_id_\d+>\s*"
    segments = []
    for se in se_order:
        value_pattern = r"positive|negative|neutral" if se == 's' else r"[^;]+"
        segments.append(rf"{sentinel}(?P<{data_utils.SENTIMENT_ELEMENT[se]}>{value_pattern})\s*")
    pattern = "".join(segments)

    return [
        {name: value.strip() for name, value in match.groupdict().items()}
        for match in re.finditer(pattern,cleaned)
    ]

In [14]:
from transformers import AutoTokenizer

# Tokenizer shared by data encoding, metric decoding and generation.
# NOTE(review): the tokenizer comes from "t5-base" while the training loop loads
# the "t5-large" model - confirm both checkpoints use the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
# Keyword arguments forwarded to the tokenizer call in encode():
# sequences are padded, truncated to at most 128 tokens and returned as PyTorch tensors.
encoding_args = dict(
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [16]:
def encode(dataset):
    """
    Tokenize a batch: `dataset["input"]` becomes the model input and
    `dataset["output"]` the target text, using the module-level `encoding_args`.

    Returns whatever the tokenizer call returns.
    """
    # NOTE(review): targets are padded by the tokenizer here (padding=True);
    # confirm whether padded label positions should be replaced by -100 so the
    # loss ignores them.
    return tokenizer(dataset["input"], text_target=dataset["output"], **encoding_args)

In [17]:
from datasets import Dataset

def create_data_2(tasks):
    """
    Build the text-to-text datasets for one task mix.

    * tasks: list of task identifiers whose TRAIN splits are pooled (over all
      domains in `data_intermediate`). Note that this parameter shadows the
      module-level `tasks` dict inside the function.

    The validation and test sets always use the "oasc" task only.

    Returns a tuple (data_2, data_tok):
    * data_2: {"train" | "val" | "test": Dataset with columns input/output/task}
    * data_tok: tokenized counterpart; "train" and "val" go through Dataset.map
      (text columns removed), "test" is the direct result of encode().
    """
    def build_rows(domain, task, split):
        # One prompt/answer pair per example of the requested domain/task/split.
        return [
            {
                "input" : construct_prompt(el["text"],task),
                "output" : construct_answer(el["target"],task),
                "task" : task
            }
            for el in data_intermediate[domain][task][split]
        ]

    rows = {"train" : [], "val" : [], "test" : []}
    for domain in data_intermediate:
        # TRAIN: every requested task
        for basic_task in tasks:
            rows["train"].extend(build_rows(domain, basic_task, "train"))
        # VAL / TEST: the full quadruple task only
        rows["val"].extend(build_rows(domain, "oasc", "val"))
        rows["test"].extend(build_rows(domain, "oasc", "test"))

    data_2 = {split: Dataset.from_list(split_rows) for split, split_rows in rows.items()}

    data_tok = {}
    for split, dataset in data_2.items():
        if split == "test":
            data_tok[split] = encode(dataset)
        else:
            data_tok[split] = dataset.map(encode,batched=True,remove_columns=["input","output","task"])

    return data_2, data_tok

In [18]:
# Build the datasets for the first task mix (combination_tasks[0]) as a preview.
data_2, data_tok = create_data_2(combination_tasks[0])

                                                                  

In [19]:
# Display the split sizes and columns of the preview datasets.
data_2

{'train': Dataset({
     features: ['input', 'output', 'task'],
     num_rows: 6294
 }),
 'val': Dataset({
     features: ['input', 'output', 'task'],
     num_rows: 525
 }),
 'test': Dataset({
     features: ['input', 'output', 'task'],
     num_rows: 1081
 })}

In [20]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the Seq2SeqTrainer below; only the tokenizer is passed in.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [21]:
from transformers import EvalPrediction
from evaluation import recall, precision, f1_score, summary_score
from typing import List, Dict, Tuple
import numpy as np

def seperate_target_prediction_per_task(predictions:List[List[Dict]],targets:List[List[Dict]],tasks:List) -> Tuple[Dict[str,List],Dict[str,List]]:
    """
    Group targets and predictions by the task of the example they belong to.

    * predictions: one list of predicted tuples per example.
    * targets: one list of gold tuples per example.
    * tasks: the task identifier of each example (same order as the other two).

    Returns (per_task_targets, per_task_predictions) - note that targets come
    FIRST in the result although predictions come first in the parameters.
    Iteration stops at the shortest of the three inputs.
    """
    targets_by_task: Dict[str,List] = {}
    predictions_by_task: Dict[str,List] = {}
    for gold, predicted, task_name in zip(targets,predictions,tasks):
        targets_by_task.setdefault(task_name, []).append(gold)
        predictions_by_task.setdefault(task_name, []).append(predicted)
    return targets_by_task, predictions_by_task

def preprocess_eval_preds(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer):
    """
    Turn the id arrays of an EvalPrediction into decoded strings.

    * eval_preds: object exposing `inputs`, `label_ids` and `predictions`.
    * decoding_args: keyword arguments forwarded to tokenizer.batch_decode.
    * tokenizer: tokenizer used for decoding.

    Returns (inputs, targets, predictions), each the result of batch_decode.
    """
    def _first_if_tuple(ids):
        # In case the model returns more than the prediction logits
        return ids[0] if isinstance(ids, tuple) else ids

    def _to_ids(ids):
        # A 3-dimensional array is treated as scores over the vocabulary and
        # collapsed with argmax (in case not predict with generate).
        return np.argmax(ids,axis=-1) if len(ids.shape) == 3 else ids

    def _drop_ignored(ids):
        # -100 marks ignored positions and cannot be decoded.
        return [[token for token in row if token != -100] for row in ids]

    raw = (eval_preds.inputs, eval_preds.label_ids, eval_preds.predictions)
    unwrapped = [_first_if_tuple(ids) for ids in raw]
    id_arrays = [_to_ids(ids) for ids in unwrapped]
    cleaned = [_drop_ignored(ids) for ids in id_arrays]

    inputs, targets, predictions = [tokenizer.batch_decode(ids,**decoding_args) for ids in cleaned]

    return inputs, targets, predictions

def compute_metrics(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer,tasks:List) -> Dict[str,float]: # MAY NOT BE SUFFICIENT FOR CAUSAL LM
    """
    ### DESC
        Method to compute the metrics.
    ### PARAMS
    * eval_preds: EvalPrediction instance from training.
    * decoding_args: Decoding arguments.
    * tokenizer: Tokenizer used to decode the ids in eval_preds.
    * tasks: Task identifier of every evaluated example, in dataset order.
    ### RETURN
    * metrics: Dictionary of metrics ("overall_*" plus "<task>_*" for each task).
    """
    _inputs, target_texts, prediction_texts = preprocess_eval_preds(eval_preds,decoding_args,tokenizer)

    # Drop "non_absa" rows from targets, predictions AND tasks in lockstep.
    # Filtering only targets/predictions would shift them against `tasks`
    # and attribute rows to the wrong task in the per-task grouping below.
    absa_tasks, targets, predictions = [], [], []
    for target_text, prediction_text, task in zip(target_texts,prediction_texts,tasks):
        if task == "non_absa":
            continue
        absa_tasks.append(task)
        targets.append(catch_answer(target_text,task))
        predictions.append(catch_answer(prediction_text,task))

    per_task_targets, per_task_predictions = seperate_target_prediction_per_task(predictions, targets, absa_tasks)

    metrics = {}

    metrics["overall_recall"] = recall(predictions,targets)
    metrics["overall_precision"] = precision(predictions,targets)
    metrics["overall_f1_score"] = f1_score(predictions,targets)

    # "non_absa" can no longer appear here because it was filtered out above.
    for task in per_task_targets.keys():
        metrics[f"{task}_recall"] = recall(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_precision"] = precision(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_f1_score"] = f1_score(per_task_predictions[task],per_task_targets[task])

    return metrics

In [22]:
from transformers import Seq2SeqTrainingArguments

# Training configuration shared by every experiment in this notebook.
# Saving, evaluation and logging all happen once per epoch, and the checkpoint
# with the best "overall_f1_score" is reloaded at the end of training.
train_args = Seq2SeqTrainingArguments(
    num_train_epochs=10,
    learning_rate=3e-4,
    save_total_limit=2,
    gradient_accumulation_steps=2,
    # A batch of 16 is split across the visible GPUs (assumes n_gpu >= 1).
    per_device_train_batch_size=16//n_gpu,
    per_device_eval_batch_size=16//n_gpu,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    metric_for_best_model="overall_f1_score",
    load_best_model_at_end=True,
    adam_epsilon=1e-08,
    output_dir="./output",
    logging_dir="./output/log",
    # compute_metrics reads eval_preds.inputs, so inputs must be included.
    include_inputs_for_metrics=True,
)

In [23]:
import torch
# Use the first visible CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [24]:
from transformers import Seq2SeqTrainer

# Keep special tokens when decoding: catch_answer() relies on the
# "<extra_id_N>" markers being present in the decoded text.
decoding_args = dict(skip_special_tokens=False)

def preprocess_logits_for_metrics(logits, targets):
    """
    Reduce logits to token ids before they are accumulated for metric computation.

    * logits: a tensor of scores, or a tuple whose first item is that tensor.
    * targets: passed through unchanged.

    Returns (pred_ids, targets) where pred_ids is the argmax over the last dimension.
    """
    if isinstance(logits,tuple):
        scores = logits[0]
    else:
        scores = logits
    return torch.argmax(scores, dim=-1), targets

In [25]:
from tqdm import tqdm
from typing import List, Dict

def generate_predictions(model,tokenizer,data,device=torch.device("cuda:0"),decoding_args:Dict=None) -> List[str]:
    """
    Generate one output string per input text, one text at a time.

    * model: model exposing `generate`, `eval`, `train` and `training`.
    * tokenizer: tokenizer used to encode each text and decode the generations.
    * data: iterable of input strings.
    * device: device the input ids are moved to before generation.
    * decoding_args: keyword arguments forwarded to tokenizer.batch_decode
      (defaults to no extra arguments).

    Returns the decoded generations, in the order of `data`.
    """
    # Avoid a shared mutable default argument.
    if decoding_args is None:
        decoding_args = {}

    generated_rows = []
    # Generation must run in eval mode (e.g. dropout disabled); the previous
    # mode is restored afterwards so the caller's model state is unchanged.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for text in tqdm(data):
                input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
                generated = model.generate(input_ids=input_ids, pad_token_id=tokenizer.pad_token_id,eos_token_id=tokenizer.eos_token_id,max_length=128)
                # Store plain Python ints instead of 0-d tensors.
                generated_rows.extend(generated.cpu().tolist())
    finally:
        model.train(was_training)

    # -100 is not a decodable token id; drop it defensively before decoding.
    generated_rows = [[token for token in row if token != -100] for row in generated_rows]
    return tokenizer.batch_decode(generated_rows,**decoding_args)

In [26]:
import json

def save_result(str_preds_,preds,targets,filename):
    """
    Write predictions next to their targets as a JSON list and return that list.

    * str_preds_: raw decoded model outputs ("<pad>" and "</s>" are stripped out).
    * preds: parsed predictions, one entry per output.
    * targets: parsed gold targets, one entry per output.
    * filename: path of the JSON file to (over)write.

    All three sequences must have the same length (checked with an assert).
    Returns a list of {"str_pred", "pred", "target"} dicts.
    """
    cleaned_preds = [raw.replace("<pad>",'').replace("</s>",'') for raw in str_preds_]
    assert len(cleaned_preds) == len(preds) == len(targets)

    records = [
        {"str_pred" : text, "pred" : pred, "target" : target}
        for text, pred, target in zip(cleaned_preds, preds, targets)
    ]

    with open(filename,'w') as fp:
        json.dump(records,fp)
    return records

In [29]:
from transformers import AutoModelForSeq2SeqLM
import torch

import gc

# Train and evaluate one model per task mix. For every mix:
#   1. build the datasets, 2. fine-tune a fresh t5-large, 3. generate on the
#   "oasc" test set, 4. save the predictions to "<tasks joined by '-'>.json".
# NOTE(review): all runs share train_args.output_dir, so checkpoints of earlier
# runs are still present when the next run rotates checkpoints (visible in the
# captured log as "Deleting older checkpoint ..."); consider one directory per mix.
scores = {}  # test-set summary score of every task mix, keyed by result file name

for combo_task in combination_tasks:
    data_2, data_tok = create_data_2(combo_task)
    # Extract the validation task column once instead of on every metrics call.
    val_tasks = data_2["val"]["task"]

    model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
    model.to(device)
    trainer = Seq2SeqTrainer(
            model = model,
            args = train_args,
            tokenizer = tokenizer,
            data_collator = data_collator,
            train_dataset = data_tok["train"],
            eval_dataset = data_tok["val"],
            compute_metrics = lambda eval_preds, val_tasks=val_tasks: compute_metrics(eval_preds,decoding_args,tokenizer,val_tasks),
            preprocess_logits_for_metrics = preprocess_logits_for_metrics
        )

    trainer.train()

    str_preds = generate_predictions(model, tokenizer, data_2["test"]["input"], device, decoding_args)
    preds = [catch_answer(el,"oasc") for el in str_preds]
    targets = [catch_answer(el,"oasc") for el in data_2["test"]["output"]]
    score = summary_score(preds,targets)

    fname = '-'.join(combo_task) + ".json"

    result = save_result(str_preds, preds, targets, fname)

    # Keep and show the score; it was previously computed and then discarded.
    scores[fname] = score
    print(fname, score)

    # Free GPU memory before the next model is loaded. The trainer holds its own
    # references to the model (and optimizer state), so deleting `model` alone
    # would not release anything.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

loading configuration file config.json from cache at /home/m13519061/.cache/huggingface/hub/models--t5-large/snapshots/150ebc2c4b72291e770f58e6057481c8d2ed331a/config.json
Model config T5Config {
  "_name_or_path": "t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 525
  Batch size = 16
  Num examples = 525
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-66
Configuration saved in ./output/checkpoint-66/config.json
Model weights saved in ./output/checkpoint-66/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-66/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-66/special_tokens_map.json
Copy vocab file to ./output/checkpoint-66/spiece.model
Deleting older checkpoint [output/checkpoint-591] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 525
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-132
Configuration saved in ./output/checkpoint-132/config.json
Model weights saved in ./output/checkpoint-132/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-132/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-132/special_tokens_map.json
Copy vocab file to ./output/checkp

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 525
  Num examples = 525
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-393
Configuration saved in ./output/checkpoint-393/config.json
Model weights saved in ./output/checkpoint-393/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-393/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-393/special_tokens_map.json
Copy vocab file to ./output/checkpoint-393/spiece.model
Deleting older checkpoint [output/checkpoint-594] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 525
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-786
Configuration saved in ./output/checkpoint-786/config.json
Model weights saved in ./output/checkpoint-786/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-786/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-786/special_tokens_map.json
Copy vocab file to ./output/checkpoint-786/spi

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 525
  Batch size = 16
  Num examples = 525
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-262
Configuration saved in ./output/checkpoint-262/config.json
Model weights saved in ./output/checkpoint-262/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-262/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-262/special_tokens_map.json
Copy vocab file to ./output/checkpoint-262/spiece.model
Deleting older checkpoint [output/checkpoint-3144] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 525
  Batch size = 16
Saving model checkpoint to ./output/checkpoint-1572
Configuration saved in ./output/checkpoint-1572/config.json
Model weights saved in ./output/checkpoint-1572/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-1572/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1572/special_tokens_map.json
Copy vocab file to ./o