In [1]:
import os
import torch

# Expose only the GPU with index 2 to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "2"})
# Number of CUDA devices torch reports; used further down to size per-device batches.
n_gpu = torch.cuda.device_count()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
import sys
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
sys.path.append("../../../src/")
import data_utils

# Dataset Utilities

In [4]:
# Root folder of the annotated data, keyed by domain.
william_dir = {"hotel": "../../../data/absa/id/william"}

# william[domain][split] -> examples read from <root>/<file> in "aos" target format.
# The validation split lives in dev.txt.
william = {
    domain: {
        split: data_utils.read_data(path=root + "/" + file_name,
                                    target_format="aos")
        for split, file_name in (("train", "train.txt"),
                                 ("val", "dev.txt"),
                                 ("test", "test.txt"))
    }
    for domain, root in william_dir.items()
}

# Data Preprocessing 1

1. AOS (ASTE)
    * AO
    * AS
    * A
    * O

2. ACS (TASD)
    * AS
    * CS
    * A
    * C

3. ACOS
    * AO
    * AS
    * CS
    * A
    * O
    * C

In [5]:
# For each complex task: the task itself followed by the simpler sub-tasks derived from it.
task_tree = {
    "oas": ["oas", "oa", "as", "a", "o"],
    "asc": ["asc", "as", "sc", "a", "c"],
    "oasc": ["oasc", "oa", "as", "sc", "a", "o", "c"],
}

# Every distinct task name, in order of first appearance (dict.fromkeys keeps insertion order).
all_task = list(dict.fromkeys(
    name
    for complex_task, sub_tasks in task_tree.items()
    for name in (complex_task, *sub_tasks)
))

print(all_task)

['oas', 'oa', 'as', 'a', 'o', 'asc', 'sc', 'c', 'oasc']


In [6]:
def reduce_num_targets(num_targets,og_format,reduced_format):
    """
    Project every index tuple onto a subset / reordering of its elements.

    * num_targets: list of tuples whose positions follow `og_format`.
    * og_format: element codes of the incoming tuples, one character each (e.g. "aos").
    * reduced_format: element codes to keep, in the desired output order (e.g. "oa").

    Returns a new list of tuples. Raises ValueError if a requested code is
    missing from `og_format`.
    """
    source_order = list(og_format)
    # Position in the original tuple of each element we want to keep.
    keep_positions = [source_order.index(code) for code in list(reduced_format)]
    projected = []
    for entry in num_targets:
        projected.append(tuple(entry[pos] for pos in keep_positions))
    return projected

def remove_duplicates_num_targets(num_targets):
    """
    Return `num_targets` without repeated entries, keeping first occurrences in order.

    Entries contain lists (unhashable), so membership is checked against the
    output list rather than a set.
    """
    unique = []
    for candidate in num_targets:
        if candidate in unique:
            continue
        unique += [candidate]
    return unique

In [7]:
from copy import deepcopy

# William (AOS ID)
# william_intermediate[domain][task][split] -> examples whose "target" and
# "num_targets" are reduced to the sentiment elements named by `task`.
william_intermediate = dict()

for domain, splits in william.items():
    william_intermediate[domain] = dict()
    # task_tree["oas"] already begins with "oas"; dict.fromkeys drops the repeat
    # (keeping order) so the deepcopy/reduce pass is not run twice for that task.
    for task in dict.fromkeys(["oas"] + task_tree["oas"]):
        william_intermediate[domain][task] = dict()
        for split, ds in splits.items():
            # Work on a copy so the raw `william` examples stay untouched.
            ds_copy = deepcopy(ds)
            for example in ds_copy:
                ## TARGET: keep only the elements of `task`, then drop duplicates
                example["target"] = data_utils.reduce_targets(example["target"],task)
                example["target"] = data_utils.remove_duplicate_targets(example["target"])
                ## NUM TARGETS: same reduction on the index tuples, stored in "aos" order
                example["num_targets"] = reduce_num_targets(example["num_targets"],"aos",task)
                example["num_targets"] = remove_duplicates_num_targets(example["num_targets"])
            william_intermediate[domain][task][split] = ds_copy

# Answer Engineering

In [8]:
# def construct_answer(targets,se_order):
#     if len(targets) == 0:
#         return "NULL"
#     result = []
#     for t in targets:
#         constructed_t = []
#         for se in se_order:
#             element = t[data_utils.SENTIMENT_ELEMENT[se]]
#             for k, v in added_tokens.items():
#                 element = element.replace(k,v)
#             constructed_t.append(element)
#         constructed_t = " , ".join(constructed_t)
#         constructed_t = f"( {constructed_t} )"
#         result.append(constructed_t)
#     result = " ; ".join(result)
#     return result

def construct_answer(num_targets):
    """
    Flatten index tuples into the comma-separated answer string the model is trained on.

    A list element (a word-index span) contributes its first and last index;
    any other element (e.g. a sentiment tag) is emitted as is.
    Returns "NULL" when there are no tuples.
    """
    if len(num_targets) < 1:
        return "NULL"
    pieces = []
    for target in num_targets:
        for element in target:
            if not isinstance(element, list):
                pieces.append(element)
                continue
            # span -> start index, end index
            pieces.extend([str(element[0]), str(element[-1])])
    return ','.join(pieces)

In [9]:
# Inspect one reduced training example (index 2) of the "oas" task.
william_intermediate["hotel"]["oas"]["train"][2]

{'text': 'oke banget , tetapi ac nya tidak bisa diatur suhu nya .',
 'target': [{'aspect': 'ac nya',
   'opinion': 'tidak bisa diatur',
   'sentiment': 'negative'},
  {'aspect': 'NULL', 'opinion': 'oke banget', 'sentiment': 'positive'}],
 'num_targets': [([6, 7, 8], [4, 5], 'NEG'), ([0, 1], [-1], 'POS')]}

In [10]:
# Encode the index targets of the first "oas" training example as an answer string.
construct_answer(william_intermediate["hotel"]["oas"]["train"][0]["num_targets"])

'6,8,5,5,NEG,14,15,12,13,NEG'

# Prompt Engineering

In [11]:
def construct_prompt(text,se_order):
    """
    Build the model input: the sentence, " | ", then the comma-separated list of
    fields the model should output.

    Aspect ('a') and opinion ('o') elements are requested as two fields,
    "<name>_start" and "<name>_end"; every other element is requested by its
    name alone. Names come from data_utils.SENTIMENT_ELEMENT.
    """
    fields = []
    for se in se_order:
        label = data_utils.SENTIMENT_ELEMENT[se]
        if se in ('o', 'a'):
            fields += [label + "_start", label + "_end"]
        else:
            fields.append(label)
    return text + " | " + ",".join(fields)

In [12]:
# Build the "oas" prompt for the first training sentence.
construct_prompt(william_intermediate["hotel"]["oas"]["train"][0]["text"],"oas")

'kamar saya ada kendala di ac tidak berfungsi optimal . dan juga wifi koneksi kurang stabil . | opinion_start,opinion_end,aspect_start,aspect_end,sentiment'

# Answer Catch

In [13]:
import re

def catch_answer(output,se_order,text):
    """
    Convert an index-based answer string back into a list of sentiment-tuple dicts.

    `output` is a flat comma-separated string such as '6,8,5,5,NEG'. Aspect ('a')
    and opinion ('o') elements occupy two fields each (start and end word index
    into `text.split()`); every other element occupies one field.

    Params:
    * output: answer string, or the literal "NULL" meaning "no tuples".
    * se_order: sentiment-element order, one character per element (e.g. "oas").
    * text: the sentence the word indices refer to.

    Returns a list of dicts keyed by data_utils.SENTIMENT_ELEMENT[se]. A span
    with start or end index -1 becomes "NULL". Tuples with a non-integer index
    or an unknown sentiment tag are dropped, as is a trailing incomplete tuple.
    """
    splitted_text = text.split()
    if output == "NULL":
        return []

    fields = [piece.strip() for piece in output.split(',')]

    # Number of fields making up one tuple.
    chunk_size = sum(2 if se in ('a', 'o') else 1 for se in se_order)
    if chunk_size == 0:
        # Nothing to parse; also avoids range() being called with a zero step.
        return []

    result = []
    for offset in range(0, len(fields), chunk_size):
        chunk = fields[offset:offset + chunk_size]
        if len(chunk) != chunk_size:
            continue  # incomplete tuple at the end of the output

        pred = {}
        cursor = 0
        is_valid = True
        for se in se_order:
            if se in ('a', 'o'):
                raw_start, raw_end = chunk[cursor], chunk[cursor + 1]
                cursor += 2
                try:
                    start_index = int(raw_start)
                    end_index = int(raw_end)
                except ValueError:
                    # Model emitted something that is not an integer index.
                    is_valid = False
                    break
                if end_index < start_index:
                    start_index, end_index = end_index, start_index
                if start_index == -1 or end_index == -1:
                    pred[data_utils.SENTIMENT_ELEMENT[se]] = "NULL"
                else:
                    # Slicing clamps out-of-range indices, so such spans give a
                    # shorter (possibly empty) string instead of raising.
                    span = splitted_text[start_index:end_index + 1]
                    pred[data_utils.SENTIMENT_ELEMENT[se]] = ' '.join(span)
            elif se == 's':
                try:
                    sentiment = data_utils.SENTTAG2WORD[chunk[cursor]]
                    pred[data_utils.SENTIMENT_ELEMENT['s']] = sentiment
                except LookupError:
                    # Unknown sentiment tag.
                    is_valid = False
                    break
                cursor += 1
            else: # c
                pred[data_utils.SENTIMENT_ELEMENT[se]] = chunk[cursor]
                cursor += 1
        if is_valid:
            result.append(pred)
    return result

In [14]:
# Round-trip check inputs: the encoded answer and sentence of the first "oas" training example.
ans = construct_answer(william_intermediate["hotel"]["oas"]["train"][0]["num_targets"])
text = william_intermediate["hotel"]["oas"]["train"][0]["text"]

In [15]:
# Gold textual targets of the same example, for comparison with the decoded answer below.
william_intermediate["hotel"]["oas"]["train"][0]["target"]

[{'aspect': 'ac',
  'opinion': 'tidak berfungsi optimal',
  'sentiment': 'negative'},
 {'aspect': 'wifi koneksi',
  'opinion': 'kurang stabil',
  'sentiment': 'negative'}]

In [16]:
# Decode the index answer back to text; it should match the gold targets shown above.
catch_answer(ans,"oas",text)

[{'opinion': 'tidak berfungsi optimal',
  'aspect': 'ac',
  'sentiment': 'negative'},
 {'opinion': 'kurang stabil',
  'aspect': 'wifi koneksi',
  'sentiment': 'negative'}]

# Data Preprocessing 2

In [17]:
from datasets import Dataset

# william_2[domain][split] -> datasets.Dataset with "input" / "output" / "task" columns.
# train mixes every task listed in task_tree["oas"]; val and test only use the "oas" task.
william_2 = dict()
for domain, per_task in william_intermediate.items():
    train_records = [
        {
            "input" : construct_prompt(el["text"],basic_task),
            "output" : construct_answer(el["num_targets"]),
            "task" : basic_task
        }
        for basic_task in task_tree["oas"]
        for el in per_task[basic_task]["train"]
    ]
    eval_records = {
        split: [
            {
                "input" : construct_prompt(el["text"],"oas"),
                "output" : construct_answer(el["num_targets"]),
                "task" : "oas"
            }
            for el in per_task["oas"][split]
        ]
        for split in ("val", "test")
    }
    william_2[domain] = {
        "train" : Dataset.from_list(train_records),
        "val" : Dataset.from_list(eval_records["val"]),
        "test" : Dataset.from_list(eval_records["test"])
    }

In [18]:
# Encoded answer of training row 2.
william_2["hotel"]["train"]["output"][2]

'6,8,4,5,NEG,0,1,-1,-1,POS'

In [19]:
# Decode that row again, using its own task and the sentence part of its prompt (text before '|').
catch_answer(william_2["hotel"]["train"]["output"][2],william_2["hotel"]["train"]["task"][2],william_2["hotel"]["train"]["input"][2].split('|')[0])

[{'opinion': 'tidak bisa diatur', 'aspect': 'ac nya', 'sentiment': 'negative'},
 {'opinion': 'oke banget', 'aspect': 'NULL', 'sentiment': 'positive'}]

# Prepare Tokenized Dataset

In [20]:
# Keyword arguments passed to the tokenizer call in encode_id.
encoding_args = dict(
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [21]:
# Tokenizer of the multilingual mT5 base model.
tokenizer_id = AutoTokenizer.from_pretrained("google/mt5-base")



In [22]:
def encode_id(dataset):
    """
    Tokenize a batch with `tokenizer_id`: the "input" column is encoded as the
    source and the "output" column as `text_target`, using `encoding_args`.
    """
    sources = dataset["input"]
    answers = dataset["output"]
    return tokenizer_id(sources, text_target=answers, **encoding_args)

In [23]:
# william_tok[domain][split] -> tokenized data.
# train / val are mapped datasets in torch format with the text columns removed;
# test is the direct result of a single encode_id call on the whole split.
william_tok = {}
for domain, splits in william_2.items():
    tokenized = william_tok[domain] = {}
    for split in splits:
        dataset = william_2[domain][split]
        if split == "test":
            tokenized[split] = encode_id(dataset)
            continue
        tokenized[split] = dataset.map(encode_id,batched=True,remove_columns=["input","output","task"])
        tokenized[split].set_format("torch")

                                                                    

# Data Collator

## Indo

In [24]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for the trainer, built around the mT5 tokenizer.
data_collator_id = DataCollatorForSeq2Seq(tokenizer=tokenizer_id)

# Compute Metrics

In [25]:
from transformers import EvalPrediction
from evaluation import recall, precision, f1_score, summary_score
from typing import List, Dict, Tuple
import numpy as np

def seperate_target_prediction_per_task(predictions:List[List[Dict]],targets:List[List[Dict]],tasks:List) -> Tuple[Dict[str,List],Dict[str,List]]:
    """
    Group parallel lists of predictions and targets by their task name.

    The three arguments are read in lockstep (zip), so they must be aligned.
    Returns (per_task_targets, per_task_predictions) - targets first - each a
    dict mapping task name to the list of that task's entries in input order.
    """
    grouped_targets = {}
    grouped_predictions = {}
    for gold, guess, task_name in zip(targets,predictions,tasks):
        grouped_targets.setdefault(task_name, []).append(gold)
        grouped_predictions.setdefault(task_name, []).append(guess)
    return grouped_targets, grouped_predictions

def preprocess_eval_preds(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer):
    """
    Decode the inputs, labels and predictions of an EvalPrediction into strings.

    Each of the three arrays is handled the same way: unwrap it if it is a tuple
    (first item), collapse a 3-D array with argmax over the last axis, drop every
    -100 entry, then decode with `tokenizer.batch_decode(**decoding_args)`.

    Returns (inputs, targets, predictions) as lists of strings.
    """
    def _decode(ids):
        # In case the model returns more than one array
        if isinstance(ids, tuple):
            ids = ids[0]
        # 3-D means per-token scores rather than ids (not predicted with generate)
        if len(ids.shape) == 3:
            ids = np.argmax(ids,axis=-1)
        kept = [[token for token in row if token != -100] for row in ids]
        return tokenizer.batch_decode(kept,**decoding_args)

    inputs = _decode(eval_preds.inputs)
    targets = _decode(eval_preds.label_ids)
    predictions = _decode(eval_preds.predictions)

    return inputs, targets, predictions

def compute_metrics(eval_preds:EvalPrediction,decoding_args:Dict[str,str],tokenizer:AutoTokenizer,tasks:List) -> Dict[str,float]: # MAY NOT BE SUFFICIENT FOR CAUSAL LM
    """
    ### DESC
        Decode an evaluation run, parse answers back into tuples and score them.
    ### PARAMS
    * eval_preds: EvalPrediction instance from training (its `inputs` are used
      to recover the sentence each answer indexes into).
    * decoding_args: Keyword arguments forwarded to `tokenizer.batch_decode`.
    * tokenizer: Tokenizer used for decoding.
    * tasks: Task name of every evaluation example, aligned with eval_preds.
    ### RETURN
    * metrics: Dictionary with overall_* and <task>_* recall / precision /
      f1_score. Examples whose task is "non_absa" are excluded.
    """
    inputs, targets, predictions = preprocess_eval_preds(eval_preds,decoding_args,tokenizer)

    if len(inputs) > 0:
        print("INPUTS >>",inputs[0])
        print("TARGETS >>",targets[0])
        print("PREDS >>",predictions[0])

    eos_token = getattr(tokenizer,"eos_token",None) or "</s>"
    pad_token = getattr(tokenizer,"pad_token",None) or "<pad>"

    def _clean(decoded):
        # decoding_args may keep special tokens in the text; left in place they
        # make catch_answer reject the tuple (e.g. "NEG</s>" is not a known tag).
        # Keep what precedes the first end-of-sequence token and drop padding.
        return decoded.split(eos_token)[0].replace(pad_token,'').strip()

    texts = [el.split('|')[0].strip() for el in inputs]

    # Filter "non_absa" rows out of targets, predictions AND tasks together so
    # the three lists stay aligned for the per-task grouping below.
    parsed_targets, parsed_predictions, absa_tasks = [], [], []
    for target, prediction, task, text in zip(targets,predictions,tasks,texts):
        if task == "non_absa":
            continue
        parsed_targets.append(catch_answer(_clean(target),task,text))
        parsed_predictions.append(catch_answer(_clean(prediction),task,text))
        absa_tasks.append(task)

    per_task_targets, per_task_predictions = seperate_target_prediction_per_task(parsed_predictions, parsed_targets, absa_tasks)

    metrics = {}

    metrics["overall_recall"] = recall(parsed_predictions,parsed_targets)
    metrics["overall_precision"] = precision(parsed_predictions,parsed_targets)
    metrics["overall_f1_score"] = f1_score(parsed_predictions,parsed_targets)

    for task in per_task_targets.keys():
        metrics[f"{task}_recall"] = recall(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_precision"] = precision(per_task_predictions[task],per_task_targets[task])
        metrics[f"{task}_f1_score"] = f1_score(per_task_predictions[task],per_task_targets[task])

    return metrics

# Train Arguments

In [26]:
from transformers import Seq2SeqTrainingArguments

# Training configuration.
# The per-device batch size splits a total of 8 over the visible GPUs. It is
# clamped so that a CPU-only run (n_gpu == 0) does not divide by zero and a
# machine with more than 8 GPUs does not end up with a batch size of 0.
train_args = Seq2SeqTrainingArguments(
    num_train_epochs = 10,
    learning_rate = 3e-4,
    save_total_limit = 2,
    gradient_accumulation_steps = 2,
    per_device_train_batch_size = max(1, 8 // max(1, n_gpu)),
    per_device_eval_batch_size = max(1, 8 // max(1, n_gpu)),
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    metric_for_best_model = "overall_f1_score",
    load_best_model_at_end = True,
    adam_epsilon = 1e-08,
    output_dir = "./output",
    logging_dir = "./output/log",
    # compute_metrics needs the inputs to recover the sentence behind each answer
    include_inputs_for_metrics = True
)

# Train

In [27]:
import torch
# Use the first visible GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [28]:
from transformers import Seq2SeqTrainer

# trainer = {
#     "peng" : {},
#     "wan" : {},
#     "zhang" : {},
#     "william" : {}
# }

# Keyword arguments for tokenizer.batch_decode: special tokens are kept in the decoded text.
decoding_args = dict(skip_special_tokens=False)

def preprocess_logits_for_metrics(logits, targets):
    """
    Reduce logits to predicted ids (argmax over the last dimension).

    If `logits` is a tuple only its first item is used. `targets` is returned
    unchanged as the second item of the result.
    """
    if isinstance(logits,tuple):
        logits = logits[0]
    return logits.argmax(dim=-1), targets

In [29]:
from tqdm import tqdm

def generate_predictions(model,tokenizer,data,device=torch.device("cuda:0"),decoding_args:Dict=None) -> List[str]:
    """
    Generate one decoded output string per input text.

    Params:
    * model: model exposing `generate`; expected to already be on `device`.
    * tokenizer: tokenizer used to encode each text and decode the outputs.
    * data: iterable of input strings, processed one at a time.
    * device: device the encoded inputs are moved to.
    * decoding_args: keyword arguments for `tokenizer.batch_decode`
      (defaults to none).

    Returns the decoded generations in input order. Generation is capped at
    max_length=128.
    """
    if decoding_args is None:
        decoding_args = {}

    # Generate in eval mode (dropout off) even if the model has just been
    # trained, then put the model back in the mode it was in.
    was_training = getattr(model,"training",False)
    model.eval()

    generated = []
    try:
        with torch.no_grad():
            for text in tqdm(data):
                input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
                output_ids = model.generate(input_ids=input_ids, pad_token_id=tokenizer.pad_token_id,eos_token_id=tokenizer.eos_token_id,max_length=128)
                # One row per sequence, as plain Python ints.
                generated.extend(output_ids.cpu().tolist())
    finally:
        if was_training:
            model.train()

    generated = [[token for token in row if token != -100] for row in generated]
    return tokenizer.batch_decode(generated,**decoding_args)

In [30]:
import json

def save_result(str_preds_,preds,targets,filename):
    """
    Write raw predictions, parsed predictions and targets to a JSON file.

    "<pad>" and "</s>" are removed from every raw prediction string first.
    The three sequences must have the same length (checked with an assert).
    Returns the list of {"str_pred", "pred", "target"} records that was written.
    """
    cleaned = []
    for raw in str_preds_:
        cleaned.append(raw.replace("<pad>",'').replace("</s>",''))
    assert len(cleaned) == len(preds) == len(targets)

    result = [
        {"str_pred" : text, "pred" : pred, "target" : target}
        for text, pred, target in zip(cleaned,preds,targets)
    ]

    with open(filename,'w') as fp:
        json.dump(result,fp)
    return result

# William Hotel

In [31]:
# Load an already fine-tuned checkpoint and move it to the selected device.
# NOTE(review): this is an absolute, machine-specific path - the notebook will not
# run elsewhere as is; consider a configurable checkpoint location.
model = AutoModelForSeq2SeqLM.from_pretrained("/raid/m13519061/ta/facebook-absa/doe/answer_id/bartabsa/output/checkpoint-8433")
model.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (w

In [24]:
# Fine-tune mT5-base on the hotel training set, evaluating on the validation set.
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it, and it rebinds `model`; which model the later prediction cells
# used depends on the order the cells were actually run in - confirm.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
model.to(device)
trainer = Seq2SeqTrainer(
        model = model,
        args = train_args,
        tokenizer = tokenizer_id,
        data_collator = data_collator_id,
        train_dataset = william_tok["hotel"]["train"],
        eval_dataset = william_tok["hotel"]["val"],
        # The task list is bound here because compute_metrics needs the task of each validation row.
        compute_metrics = lambda eval_preds: compute_metrics(eval_preds,decoding_args,tokenizer_id,william_2["hotel"]["val"]["task"]),
        # Reduce logits to ids before they are accumulated for the metrics.
        preprocess_logits_for_metrics = preprocess_logits_for_metrics
    )

trainer.train()

***** Running training *****
  Num examples = 15000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 9370
  Number of trainable parameters = 582401280
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Saving model checkpoint to ./output/checkpoint-937
Configuration saved in ./output/checkpoint-937/config.json
Configuration saved in ./output/checkpoint-937/config.json
Model weights saved in ./output/checkpoint-937/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-937/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-937/special_tokens_map.json
Copy vocab file to ./output/checkpoint-937/spiece.model
Deleting older checkpoint [output/checkpoint-4685] due to args.save_total_limit


Saving model checkpoint to ./output/checkpoint-1874
Configuration saved in ./output/checkpoint-1874/config.json
Configuration saved in ./output/checkpoint-1874/config.json
Model weights saved in ./output/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-1874/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-1874/special_tokens_map.json
Copy vocab file to ./output/checkpoint-1874/spiece.model
Deleting older checkpoint [output/checkpoint-5622] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


Saving model checkpoint to ./output/checkpoint-2811
Configuration saved in ./output/checkpoint-2811/config.json
Configuration saved in ./output/checkpoint-2811/config.json
Model weights saved in ./output/checkpoint-2811/pytorch_model.bin
tokenizer config file saved in ./output/checkpoint-2811/tokenizer_config.json
Special tokens file saved in ./output/checkpoint-2811/special_tokens_map.json
Copy vocab file to ./output/checkpoint-2811/spiece.model
Deleting older checkpoint [output/checkpoint-937] due to args.save_total_limit


In [32]:
# Generate a raw answer string for every test prompt.
str_preds = generate_predictions(model, tokenizer_id, william_2["hotel"]["test"]["input"], device, decoding_args)

100%|██████████| 1000/1000 [05:25<00:00,  3.07it/s]


In [42]:
# Remove the "<pad>" and "</s>" markers so the strings can be parsed by catch_answer.
# NOTE: this overwrites str_preds in place; the raw generations are no longer available afterwards.
str_preds = [el.replace('<pad>','').replace('</s>','') for el in str_preds]

In [43]:
# First cleaned prediction string.
str_preds[0]

' 1,1,0,0,POS,4,4,3,3,POS,7,7,6,6,POS,11,12,10,11,NEG'

In [44]:
# Sentence part of every test prompt (text before '|'), then the parsed "oas" predictions.
# NOTE: `text` was a single string in an earlier cell; from here on it is a list of sentences.
text = [el.split('|')[0] for el in william_2["hotel"]["test"]["input"]]
preds = [catch_answer(sp,"oas", t) for sp, t in zip(str_preds,text)]

In [45]:
# Parse the gold answer strings the same way, so predictions and targets share one text form.
targets = [catch_answer(sp,"oas", t) for sp, t in zip(william_2["hotel"]["test"]["output"],text)]

In [46]:
# Recall / precision / F1 of the parsed test predictions against the parsed targets.
summary_score(preds,targets)

{'recall': 0.4433370247323736,
 'precision': 0.473808586057503,
 'f1_score': 0.458066606576197}

In [47]:
# Persist the same summary scores to score.json (summary_score is evaluated again here).
with open("score.json",'w') as fp:
    json.dump(summary_score(preds,targets),fp)

In [None]:
# Delete the ./output directory (the training output_dir, including its checkpoints) - irreversible.
!rm -rf ./output

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [48]:
# Store raw strings, parsed predictions and targets side by side for error analysis.
result = save_result(str_preds, preds, targets, "william_hotel.json")

In [49]:
# Display the saved records (one per test example).
result

[{'str_pred': ' 1,1,0,0,POS,4,4,3,3,POS,7,7,6,6,POS,11,12,10,11,NEG',
  'pred': [{'opinion': 'ramah',
    'aspect': 'pelayanan',
    'sentiment': 'positive'},
   {'opinion': 'nyaman', 'aspect': 'kamar', 'sentiment': 'positive'},
   {'opinion': 'lengkap', 'aspect': 'fasilitas', 'sentiment': 'positive'},
   {'opinion': 'showernya kurang',
    'aspect': 'airnya showernya',
    'sentiment': 'negative'}],
  'target': [{'opinion': 'ramah',
    'aspect': 'pelayanan',
    'sentiment': 'positive'},
   {'opinion': 'nyaman', 'aspect': 'kamar', 'sentiment': 'positive'},
   {'opinion': 'lengkap', 'aspect': 'fasilitas', 'sentiment': 'positive'},
   {'opinion': 'kurang panas',
    'aspect': 'airnya showernya',
    'sentiment': 'negative'}]},
 {'str_pred': ' 0,3,-1,-1,NEG',
  'pred': [{'opinion': 'tidak terlalu jauh dari',
    'aspect': 'NULL',
    'sentiment': 'negative'}],
  'target': [{'opinion': 'tidak terlalu jauh',
    'aspect': 'dari pusat kota',
    'sentiment': 'positive'}]},
 {'str_pred': ' 

In [39]:
# Manual check: parse the first prediction string against the first test sentence.
catch_answer(" 1,1,0,0,POS,4,4,3,3,POS,7,7,6,6,POS,11,12,10,11,NEG","oas",text[0])

[{'opinion': 'ramah', 'aspect': 'pelayanan', 'sentiment': 'positive'},
 {'opinion': 'nyaman', 'aspect': 'kamar', 'sentiment': 'positive'},
 {'opinion': 'lengkap', 'aspect': 'fasilitas', 'sentiment': 'positive'},
 {'opinion': 'showernya kurang',
  'aspect': 'airnya showernya',
  'sentiment': 'negative'}]

In [40]:
# First prediction string. The output shown here still has "<pad>" / "</s>", so it
# presumably comes from a run made before the stripping cell was executed.
str_preds[0]

'<pad> 1,1,0,0,POS,4,4,3,3,POS,7,7,6,6,POS,11,12,10,11,NEG</s>'