In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 56.6 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 60.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [2]:
import pandas as pd

def open_file(path):
    f = open(path,'r',encoding="utf-8")
    data = f.read()
    f.close()
    data = data.strip().split('\n')
    return data

available_task = ["aste","triplet-detection"]

def read_files(paths,task):
    if task not in available_task:
        raise ValueError(f"Only insert available task. Available task : {available_task}")
    data = []
    for path in paths:
        data.extend(open_file(path))
    result = {
        "text" : [],
        "target" : []
    }
    for line in data:
        splitted_line = line.split('####')
        # Input
        text = splitted_line[0]
        # Output
        if task == "aste":
            num_target = eval(splitted_line[1])
            target = process_numtargets(text,num_target)
        else:
            target = 0 if splitted_line == "[]" else 1
        # Append
        result["text"].append(text)
        result["target"].append(target)
    return pd.DataFrame(result)

In [3]:
senttag2word = {'POS': 'positive', 'NEG': 'negative', 'NEU': 'neutral'}

def process_numtargets(text, target):
    sent = text.split()
    res = []
    for tup in target:
        aspect_index, opinion_index, sentiment = tup
        sentiment = senttag2word[sentiment]
        aspect = ' '.join([sent[aspect_index[i]] for i in range(len(aspect_index))])
        opinion = ' '.join([sent[opinion_index[i]] for i in range(len(opinion_index))])
        res.append((aspect,opinion,sentiment))
    return res

In [4]:
def stringify_target(target):
    processed_target = []
    for tup in target:
        concept = tup[0]
        sentiment_marker = tup[1]
        polarity = tup[2]

        processed_target.append(f"{concept} # {sentiment_marker} # {polarity}")
    return " ; ".join(processed_target)

def batch_stringify_target(batch):
  res = [stringify_target(el) for el in batch]
  return res

In [6]:
train = read_files(["train.txt"],"aste")
dev = read_files(["dev.txt"],"aste")

In [7]:
def create_shots(n,prompt,sep=" | ",random_state=None):
  samples = train.sample(n,random_state=random_state)
  stringified_targets = batch_stringify_target(samples["target"].tolist())
  text = samples.text.tolist()
  result = []
  for i in range(samples.shape[0]):
    target = stringified_targets[i] if stringified_targets[i] != "" else "None"
    result.append(text[i] + " " + prompt + " " + target)
  return sep.join(result)

In [8]:
create_shots(3,"=>")

'Saat memulai jualan , saya merasa semuanya akan baik-baik saja . Namun , pada saat penjualan sepi , muncul rasa kuatir soal masa depan usaha yang lagi dibangun . Jadi , bisa belajar juga bagaimana dunia usaha bergerak . => penjualan # sepi # negative | Kasus positif COVID - 19 di Yogyakarta menjadi 75 kasus => None | Blibli yang terbaik . Dari semua online shop yang pernah saya coba Blibli yang paling top . Kalau bisa kasih seribu bintang pasti udah saya kasih seribu . => Blibli # terbaik # positive ; Blibli # paling top # positive ; Blibli # seribu bintang # positive'

In [9]:
def add_shot(text,n,prompt,sep=" | ",random_state=None):
  result = [create_shots(n,prompt,sep,random_state),text]
  result = sep.join(result) + " " + prompt
  return result

In [61]:
dev["processed"] = dev.text.apply(lambda x : add_shot(x,5,"=>",random_state=None))

In [62]:
dev["processed"]

0      Sementara negara di Uni Eropa akan mempertimba...
1      @UNSfess_ Maksud nya tetep aja yg kepotong yg ...
2      " Healing terbaik gw , makan .. hahaha , " tul...
3      Penemuan benda tersebut bisa menjadi objek pen...
4      Helmya abang grab hampir kebawa sama aku doong...
                             ...                        
318    Semangat untuk membangun bangsa bersama GOLKAR...
319    Pinjaman dari Bank Mandiri ini akan memberikan...
320    Hanya PKS akal nya sehat , mereka berpikir jer...
321    @Arraaxx @utbkfess Dari pada cape-cape daftari...
322    cinta banget gak si sama bu susi , kerja nyata...
Name: processed, Length: 323, dtype: object

In [63]:
def tokenize(examples,tokenizer):
    return tokenizer(examples, padding=True)

In [13]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 5.1 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 57.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 48.6 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 53.3 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.

# Model

In [14]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.0 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [15]:
from transformers import XGLMTokenizer, XGLMForCausalLM
model_checkpoint = 'facebook/xglm-564M'

tokenizer = XGLMTokenizer.from_pretrained(model_checkpoint)
model = XGLMForCausalLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/276 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

In [64]:
from datasets import Dataset

dataset = Dataset.from_pandas(dev)
tokenized_dataset = dataset.map(lambda x : tokenize(x["processed"],tokenizer), batched=True, remove_columns = dev.columns.tolist())

  0%|          | 0/1 [00:00<?, ?ba/s]

In [65]:
import torch
from tqdm import tqdm

# for i in range(len(tokenized_dataset)):
#   tokenized_dataset[i]["input_ids"] = torch.Tensor(tokenized_dataset[i]["input_ids"])
# tokenized_dataset["input_ids"] = torch.Tensor(tokenized_dataset["input_ids"])
device = torch.device('cuda:0')

input_ids = torch.IntTensor(tokenized_dataset["input_ids"]).to(device)

data_loader = torch.utils.data.DataLoader(input_ids,batch_size=4,shuffle=False)

In [67]:
res = []
max_len = len(input_ids[0])
model.to(device)
# for i in range(len(input_ids)):
#   len_i = len(input_ids[i])
#   output = model.generate(input_ids[1:1+1],max_length=len_i)
#   res.append(output)
for batch in tqdm(data_loader):
  
  output = model.generate(input_ids=batch.to(device),max_length=max_len+128)
  res.extend(output)

# res = tokenizer.batch_decode(res,special_tokens=True)

100%|██████████| 81/81 [05:54<00:00,  4.37s/it]


In [69]:
text_result = []
# batch = 1
# for i in range(0,len(res),batch):
#   text_res = tokenizer.batch_decode(res[i:i+batch],skip_special_tokens=True)
#   text_result.extend(text_res)
for r in res:
  text_res = tokenizer.decode(r,skip_special_tokens=True)
  text_result.append(text_res)

In [70]:
dev["result"] = text_result

In [71]:
detokenized_input = []
for i in input_ids:
  a = tokenizer.decode(i,skip_special_tokens=True)
  detokenized_input.append(a)

In [72]:
dev["detokenized"] = detokenized_input

In [73]:
def is_equal(detokenized,generated):
  len_generated = len(generated)
  return detokenized[:len_generated] == generated

In [74]:
resulting_condition = dev.apply(lambda x : is_equal(x["detokenized"],x["result"]),axis=1)

In [78]:
def cut_result(detokenized,result):
  len_detokenized = len(detokenized)
  cutted_result = result[len_detokenized:]
  return cutted_result.split("|")[0].strip()

In [79]:
dev["triplet_result"] = dev.apply(lambda x : cut_result(x["detokenized"],x["result"]),axis=1)

In [80]:
dev["triplet_result"].unique()

array(['. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>. =>.',
       '# terbit # 2016 # negative', '', 'kurangnya # negative',
       'nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan bagaimana cara mengatasinya. =>nampaknya, pemerintah sudah mulai memikirkan',
       '️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️️',
       '# pembeli 

In [81]:
def inverse_stringify_target(stringified_target):
    if stringified_target.strip() == '':
        return []
    target = stringified_target.split(';')
    inverse_stringified_target = []
    for i in range(len(target)):
        try:
            triplet = target[i].strip().split('#')
            aspect, sentiment_marker, polarity = tuple(el.strip() for el in triplet)
            inverse_stringified_target.append((aspect, sentiment_marker, polarity))
        except:
            pass
    return inverse_stringified_target

def batch_inverse_stringify_target(batch):
    res = [inverse_stringify_target(el) for el in batch]
    return res

In [83]:
dev["triplet_result_unstring"] = batch_inverse_stringify_target(dev["triplet_result"].tolist())

In [84]:
from typing import List, Tuple, Dict
import nltk

import pandas as pd
import numpy as np
import torch
import tqdm

import json

def partial_edit_score(text : str, tuple1 : Tuple, tuple2 : Tuple) -> float:
    """
    [DESC]
        Compute the edit score between two tuples
    [PARAMS]
        text : str
        tuple1 : tuple
        tuple2 : tuple
    [RETURNS]
        score : float
    """
    score = 0
    target_true = tuple1[0]
    target_pred = tuple2[0]

    try:
        marker_true = tuple1[1]
        marker_pred = tuple2[1]
    except Exception as e:
        print("Text :",text)
        print("Tuple 1:",tuple1)
        print("Tuple 2:",tuple2)
        raise e

    sentiment_true = tuple1[2]
    sentiment_pred = tuple2[2]
    # check polarity
    if sentiment_true != sentiment_pred:
        return 0
    # check the concept
    if target_pred.lower() in text.lower(): # if the concept is in the text
        levenshtein_distance = nltk.edit_distance(target_true, target_pred)
        denom = max(len(target_true), len(target_pred))
        score += 1 - (levenshtein_distance / denom)
    # check the sentiment marker
    if marker_pred.lower() in text.lower(): # make sure the sentiment marker is in the text
        levenshtein_distance = nltk.edit_distance(marker_true, marker_pred)
        denom = max(len(marker_true), len(marker_pred))
        score += 1 - (levenshtein_distance / denom)
    return score/2

def edit_score(text : str, y_true : List[Tuple], y_pred : List[Tuple]) -> float:
    len_y_true = len(y_true)
    len_y_pred = len(y_pred)
    if len_y_true == 0 and len_y_pred == 0:
        return 0 # before I give 1 point
    
    if len_y_true == 0 or len_y_pred == 0:
        return 0

    score_matrix = []
    for i in range(len_y_true):
        score_matrix.append([])
        for j in range(len_y_pred):
            score_matrix[i].append(partial_edit_score(text,y_true[i],y_pred[j]))
    score = np.max(score_matrix, axis=1).sum()
    return score/max(len_y_true,len_y_pred)

def evaluate(sents : List[str], pred_pt : List[List[Tuple]], gold_pt : List[List[Tuple]]) -> Dict[str,float]:
    """
    [DESC]
        Function to compute F1 scores with pred and gold pairs/triplets
        The input needs to be already processed
    [PARAMS]
        sents : List[str]
        pred_pt : List[List[Tuple]]
        gold_pt : List[List[Tuple]]
    """
    try:
        assert len(sents) == len(pred_pt) == len(gold_pt)
    except AssertionError as e:
        print("Length sent:",len(sents))
        print("Sent 0:",sents[0])
        print("Length preds:",len(pred_pt))
        print("Preds 0:",pred_pt[0])
        print("Length target:",len(gold_pt))
        print("Target 0:",gold_pt[0])
        raise e
    # number of true postive, gold standard, predicted aspect terms
    n_tp, n_gold, n_pred = 0, 0, 0

    total_edit_score = 0

    n_label_not_blank = len([el for el in gold_pt if len(el) > 0])
    n_pred_not_blank = len([el for el in pred_pt if len(el) > 0])

    for i in range(len(pred_pt)):
        n_gold += len(gold_pt[i])
        n_pred += len(pred_pt[i])

        if len(gold_pt[i]) > 0:
            total_edit_score += edit_score(sents[i], gold_pt[i], pred_pt[i])

        for t in pred_pt[i]:
            if t in gold_pt[i]:
                n_tp += 1

    precision = float(n_tp) / float(n_pred) if n_pred != 0 else 0
    recall = float(n_tp) / float(n_gold) if n_gold != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    
    edit_score_recall = (total_edit_score / n_label_not_blank) if n_label_not_blank != 0 else 0
    edit_score_precision = (total_edit_score / n_pred_not_blank) if n_pred_not_blank != 0 else 0
    edit_score_f1 = 2 * edit_score_precision * edit_score_recall / (edit_score_precision + edit_score_recall) if edit_score_precision + edit_score_recall != 0 else 0
    scores = {'precision': precision, 'recall': recall, 'f1': f1, 'edit_recall' : edit_score_recall, 'edit_precision' : edit_score_precision, 'edit_f1': edit_score_f1}
    return scores

In [85]:
dev.columns

Index(['text', 'target', 'processed', 'result', 'detokenized',
       'triplet_result', 'triplet_result_unstring'],
      dtype='object')

In [86]:
evaluate(dev["text"],dev["triplet_result_unstring"],dev["target"])

{'precision': 0.0,
 'recall': 0.0,
 'f1': 0,
 'edit_recall': 0.0012717474017783618,
 'edit_precision': 0.004842422799079147,
 'edit_f1': 0.002014447884416925}