In [50]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import T5Tokenizer

# Checkpoint directory handed to T5Tokenizer.from_pretrained, and the CSV inputs.
model_name_or_path = "/srv/nas_data1/text/randy/absa/models/facebook_research/generative/fix/t5/t5_pabsa_S256_wid_small_blank=1.0"
train_path = "./train.csv"
prediction_news_path = "./news/prediction.csv"
prediction_socmed_path = "./socmed/prediction.csv"

tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)

# Read the three CSV files in the same order as the path variables above.
train, pred_news, pred_socmed = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, prediction_news_path, prediction_socmed_path)
)

In [51]:
# Rows whose task is "aste", restricted to the four columns used below;
# copied so later column assignments do not touch `train`.
train_aste = train.loc[
    train.task == "aste",
    ["text", "target", "input", "prompt"],
].copy()

In [52]:
# Keep the same five columns, in the same order, for both prediction sets.
pred_news, pred_socmed = (
    frame[["prompt", "text", "target", "string_preds", "raw_prediction"]]
    for frame in (pred_news, pred_socmed)
)

In [53]:
# Vocabulary of the loaded tokenizer; oov_percentage tests token membership against it.
vocab = tokenizer.get_vocab()

In [54]:
def len_input(text):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def oov_percentage(text):
    """Return the share of tokens of ``text`` that are not keys of ``vocab``.

    Despite the name, the result is a ratio between 0 and 1; it is not scaled
    to 100.

    Args:
        text: String passed to ``tokenizer.tokenize``.

    Returns:
        The number of tokens missing from ``vocab`` divided by the total number
        of tokens, or ``0.0`` when tokenization yields no tokens.
    """
    tokens = tokenizer.tokenize(text)
    if not tokens:
        # An empty token list (e.g. for empty text) would otherwise raise
        # ZeroDivisionError; nothing was tokenized, so nothing is out of vocabulary.
        return 0.0
    missing = sum(1 for token in tokens if token not in vocab)
    return missing / len(tokens)

In [55]:
# Token count and out-of-vocabulary ratio of the training `input` column.
train_aste["n_token"] = train_aste["input"].apply(len_input)
train_aste["oov"] = train_aste["input"].apply(oov_percentage)

# The prediction frames hold prompt and text in separate columns, so each row is
# joined as "<prompt> <text>" before it is measured.
for _frame in (pred_news, pred_socmed):
    _frame["n_token"] = _frame.apply(
        lambda row: len_input(row["prompt"] + " " + row["text"]), axis=1
    )
    _frame["oov"] = _frame.apply(
        lambda row: oov_percentage(row["prompt"] + " " + row["text"]), axis=1
    )

In [56]:
import re

# One generated triplet: "( <group 1>, <group 2>, <sentiment> )" where the
# sentiment is one of positive / negative / neutral. Each `\s?` tolerates at
# most one whitespace character at that position.
str_pattern = r'\(\s?(.+)\s?,\s?(.+)\s?,\s?(positive|negative|neutral)\s?\)'

mono = str_pattern
# Raw f-string: `\s` is not a valid string escape, so a plain f-string emits an
# invalid-escape warning. The resulting pattern text is unchanged.
# Shape: one triplet, an optional ";" separator, then one or more further triplets.
multiple = rf"{str_pattern}\s?;?\s?({str_pattern})+"
mono_pattern = re.compile(mono)
multiple_pattern = re.compile(multiple)

In [57]:
def is_valid_generation(text):
    """Tell whether a generated string has the expected output format.

    Args:
        text: The generated string. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are treated as invalid.

    Returns:
        ``True`` when ``text`` is ``"NONE"`` (ignoring surrounding whitespace)
        or starts with a triplet accepted by ``mono_pattern`` /
        ``multiple_pattern``; ``False`` otherwise.
    """
    if not isinstance(text, str):
        # Guard against NaN / None cells, which have no ``strip`` method.
        return False
    if text.strip() == "NONE":
        return True
    # NOTE(review): ``match`` anchors only at the start of the string, so any
    # text that merely begins with a well-formed triplet is accepted, and the
    # ``multiple_pattern`` check never changes the outcome. Confirm whether
    # whole-string validation (``fullmatch``) is what was intended.
    return bool(mono_pattern.match(text) or multiple_pattern.match(text))

In [58]:
# Flag, for every prediction row, whether the generated string is well-formed.
for _frame in (pred_news, pred_socmed):
    _frame["valid_generation"] = _frame["string_preds"].apply(is_valid_generation)

In [59]:
# Display the news predictions together with the n_token, oov and valid_generation columns.
pred_news

Unnamed: 0,prompt,text,target,string_preds,raw_prediction,n_token,oov,valid_generation
0,ekstraksi triplet aste :,"Terakhir , masyarakat sendiri akan memperoleh ...","[{'aspect': 'saham', 'opinion': 'memperoleh', ...",NONE,[],35,0.000000,True
1,ekstraksi triplet aste :,"Sementara itu , konser BTS Permission to Dance...",[],NONE,[],30,0.000000,True
2,ekstraksi triplet aste :,Ketika koneksi internet di rumah tidak stabil ...,"[{'aspect': 'koneksi internet', 'opinion': 'ti...",NONE,[],32,0.000000,True
3,ekstraksi triplet aste :,"Di sisi lain , Indonesia kaya akan sumber daya...","[{'aspect': 'Indonesia', 'opinion': 'kaya akan...","( Indonesia, kaya, positive ) ; ( sumber daya ...","[{'aspect': 'sumber daya gas alam', 'opinion':...",30,0.000000,True
4,ekstraksi triplet aste :,"Agensi yang menaungi BTS , Big Hit Music , men...",[],NONE,[],45,0.000000,True
...,...,...,...,...,...,...,...,...
450,ekstraksi triplet aste :,"Arti nya , segala hal yang dianggap terstruktu...",[{'aspect': 'segala hal yang dianggap terstruk...,NONE,[],32,0.000000,True
451,ekstraksi triplet aste :,""" Inflasi adalah ' ledakan ' Big Bang , "" kata...",[],"( Inflasi, ledakan'Big Bang, negative )","[{'aspect': 'Inflasi', 'opinion': ""ledakan'Big...",33,0.000000,True
452,ekstraksi triplet aste :,"Jin melanjutkan , tidak peduli lelah fisik mau...","[{'aspect': 'perusahaan', 'opinion': 'tetap se...","( ARMY, tetap senang, positive ) ; ( ARMY, bah...","[{'aspect': 'ARMY', 'opinion': 'tetap senang',...",38,0.000000,True
453,ekstraksi triplet aste :,Kini jumlah kasus Covid - 19 secara keseluruha...,"[{'aspect': 'konser', 'opinion': 'berisi penuh...",NONE,[],54,0.000000,True


In [60]:
# Reduce each frame to the columns that are written to disk below.
train_aste = train_aste[["text", "target", "n_token", "oov"]]
pred_news, pred_socmed = (
    frame[["text", "target", "string_preds", "raw_prediction", "n_token", "oov", "valid_generation"]]
    for frame in (pred_news, pred_socmed)
)

In [62]:
# Write the three frames to CSV files in the working directory, without the index.
for _frame, _csv_name in (
    (train_aste, "train_aste.csv"),
    (pred_news, "pred_news.csv"),
    (pred_socmed, "pred_socmed.csv"),
):
    _frame.to_csv(_csv_name, index=False)

In [64]:
def error_type(row):
    """Classify a row by comparing its serialized target and prediction lists.

    Both ``row["target"]`` and ``row["raw_prediction"]`` are compared against
    the literal string ``"[]"``.

    Returns:
        0 when the target is non-empty but the prediction is ``"[]"``,
        1 when the target is ``"[]"`` but the prediction is not,
        2 in every other case.
    """
    gold_is_empty = row["target"] == "[]"
    pred_is_empty = row["raw_prediction"] == "[]"
    # Keyed by (gold_is_empty, pred_is_empty); anything not listed maps to 2.
    codes = {(False, True): 0, (True, False): 1}
    return codes.get((gold_is_empty, pred_is_empty), 2)

In [65]:
def unique(list_of_dictionary):
    """Return the items of ``list_of_dictionary`` without repeats.

    The first occurrence of each item is kept and the original order is
    preserved. Items are compared with ``==`` rather than hashed, so
    unhashable items such as dicts are supported.
    """
    deduped = []
    for item in list_of_dictionary:
        if item not in deduped:
            deduped.append(item)
    return deduped

def is_equal(row):
    """Tell whether a row's prediction contains exactly the target items.

    ``row["target"]`` and ``row["raw_prediction"]`` are strings holding Python
    list literals. Both lists are de-duplicated before comparison, so repeated
    items on either side do not affect the result, and order is ignored.

    Args:
        row: Mapping (e.g. a DataFrame row) with ``"target"`` and
            ``"raw_prediction"`` entries.

    Returns:
        ``True`` when both de-duplicated lists hold the same items.

    Raises:
        ValueError, SyntaxError: If either field is not a valid Python literal.
    """
    # literal_eval only accepts literals, so text read from the CSV cannot
    # execute arbitrary code the way eval() would.
    target = unique(ast.literal_eval(row["target"]))
    raw_prediction = unique(ast.literal_eval(row["raw_prediction"]))

    if len(target) != len(raw_prediction):
        return False

    # Equal sizes and no repeats on either side: containment implies equality.
    return all(item in raw_prediction for item in target)

In [68]:
# Rows for which is_equal returns False, copied into independent frames.
sample_false_pred_news, sample_false_pred_socmed = (
    frame.loc[~frame.apply(is_equal, axis=1)].copy()
    for frame in (pred_news, pred_socmed)
)

In [70]:
# Label every mismatching row with the code returned by error_type.
for _frame in (sample_false_pred_news, sample_false_pred_socmed):
    _frame["error_type"] = _frame.apply(error_type, axis=1)

In [72]:
# Order both frames by their error_type code.
sample_false_pred_news, sample_false_pred_socmed = (
    frame.sort_values(by="error_type")
    for frame in (sample_false_pred_news, sample_false_pred_socmed)
)

In [74]:
def _sample_per_error_type(frame, n=5, seed=42):
    """Draw up to ``n`` rows for each error_type code 0, 1 and 2.

    Args:
        frame: DataFrame with an ``error_type`` column.
        n: Maximum number of rows to draw per code.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        The sampled rows concatenated in code order (0, then 1, then 2).
    """
    groups = (frame.loc[frame.error_type == code] for code in (0, 1, 2))
    return pd.concat([
        # Cap at the group size: sampling more rows than exist raises ValueError.
        group.sample(min(n, len(group)), random_state=seed)
        for group in groups
    ])

sample_false_pred_news = _sample_per_error_type(sample_false_pred_news)
sample_false_pred_socmed = _sample_per_error_type(sample_false_pred_socmed)

In [76]:
# Remove the error_type helper column by name. Dropping "the last column" by
# position would silently remove a different column if this cell were run twice;
# errors="ignore" makes a re-run a no-op instead.
sample_false_pred_news = sample_false_pred_news.drop(columns="error_type", errors="ignore")
sample_false_pred_socmed = sample_false_pred_socmed.drop(columns="error_type", errors="ignore")

In [78]:
# Write the sampled mismatches under ./analysis, without the index.
for _frame, _csv_path in (
    (sample_false_pred_news, "./analysis/sample_false_pred_news.csv"),
    (sample_false_pred_socmed, "./analysis/sample_false_pred_socmed.csv"),
):
    _frame.to_csv(_csv_path, index=False)