In [21]:
import sys
sys.path.append("../../../src/")
import data_utils

In [22]:
william_dir = dict(
    hotel = "../../../data/absa/id/william"
)

william = dict(
    hotel = dict(
        train = data_utils.read_data(path=william_dir["hotel"] + "/train.txt",
                                     target_format="aos"),
        val = data_utils.read_data(path=william_dir["hotel"] + "/dev.txt",
                                     target_format="aos"),
        test = data_utils.read_data(path=william_dir["hotel"] + "/test.txt",
                                     target_format="aos")
    )
)

In [23]:
william["hotel"]["train"] = [el for el in william["hotel"]["train"] if len(el["target"]) > 0]
william["hotel"]["val"] = [el for el in william["hotel"]["val"] if len(el["target"]) > 0]
william["hotel"]["test"] = [el for el in william["hotel"]["test"] if len(el["target"]) > 0]

In [24]:
tasks = {
    "single" : ['a', 'o'],
    "simple" : ["ao", "as"],
    "complex" : ["aos"]
}

In [25]:
combination_tasks = [
    tasks["simple"],
    tasks["complex"],
    tasks["single"] + tasks["simple"],
    tasks["single"] + tasks["complex"],
    tasks["simple"] + tasks["complex"],
    tasks["single"] + tasks["simple"] + tasks["complex"]
]

In [26]:
all_task = combination_tasks[-1]
print(all_task)

['a', 'o', 'ao', 'as', 'aos']


In [27]:
from copy import deepcopy

# William (AOS ID)
william_intermediate = dict()

for domain, v1 in william.items():
    william_intermediate[domain] = dict()
    for task in all_task:
        william_intermediate[domain][task] = dict()
        for split in v1.keys():
            ds = william[domain][split]
            ds_copy = deepcopy(ds)
            for i in range(len(ds_copy)):
                # Reduce
                ds_copy[i]["target"] = data_utils.reduce_targets(ds_copy[i]["target"],task)
                # Remove Duplicates
                ds_copy[i]["target"] = data_utils.remove_duplicate_targets(ds_copy[i]["target"])
            william_intermediate[domain][task][split] = ds_copy

In [28]:
mask = "<extra_id_X>"

In [29]:
def construct_answer(targets,se_order):
    if len(targets) == 0:
        return "NULL"
    result = []
    counter = 0
    for t in targets:
        constructed_t = ""
        for se in se_order:
            counter = counter % 100
            constructed_t += ' ' + mask.replace('X',str(counter)) + ' ' + t[data_utils.SENTIMENT_ELEMENT[se]]
            counter += 1
        constructed_t = constructed_t.strip()
        result.append(constructed_t)
    result = " ; ".join(result)
    return result

In [30]:
# def construct_prompt(text,se_order):
#     pattern = []
#     for counter, se in enumerate(se_order):
#         pattern.append(data_utils.SENTIMENT_ELEMENT[se] + " : " + mask.replace('X',str(counter)))
#     pattern = " ,".join(pattern)
#     prompt = f"Ekstrak dengan format >> {pattern} | "
#     # result = text + "| " + pattern
#     result = prompt + text
#     return result

In [31]:
import re

def catch_answer(output,se_order):
    if output == "NULL":
        return []
    output = output.replace("<pad>",'')
    output = output.replace("</s>",'')
    pattern = r""
    for se in se_order:
        if se != 's':
            pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT[se]}>[^;]+)\s*"
        else:
            pattern += f"<extra_id_\d+>\s*(?P<{data_utils.SENTIMENT_ELEMENT['s']}>positive|negative|neutral)\s*"
    found = [found_iter.groupdict() for found_iter in re.finditer(pattern,output)]
    for i in range(len(found)):
        for k, v in found[i].items():
            found[i][k] = found[i][k].strip()
    return found

In [32]:
from datasets import Dataset, DatasetDict

william_2 = dict()
for domain, v1 in william_intermediate.items():
    william_2[domain] = {
        "train" : [], # basic task
        "val" : [], # complex task
        "test" : [] # complex task
    }
    # TRAIN
    for basic_task in all_task:
        for el in william_intermediate[domain][basic_task]["train"]:
            william_2[domain]["train"].append({
                    "input" : el["text"],#construct_prompt(el["text"],basic_task),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    # VAL
    for basic_task in all_task:
        for el in william_intermediate[domain]["aos"]["val"]:
            william_2[domain]["val"].append({
                    "input" : el["text"],#construct_prompt(el["text"],"aos"),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    # TEST
    for basic_task in all_task:
        for el in william_intermediate[domain]["aos"]["test"]:
            william_2[domain]["test"].append({
                    "input" : el["text"],#construct_prompt(el["text"],"aos"),
                    "output" : construct_answer(el["target"],basic_task),
                    "task" : basic_task
                })
    william_2[domain]["train"] = Dataset.from_list(william_2[domain]["train"])
    william_2[domain]["val"] = Dataset.from_list(william_2[domain]["val"])
    william_2[domain]["test"] = Dataset.from_list(william_2[domain]["test"])

In [33]:
william_2["hotel"] = DatasetDict(william_2["hotel"])

In [34]:
william_2["hotel"]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'task'],
        num_rows: 14180
    })
    val: Dataset({
        features: ['input', 'output', 'task'],
        num_rows: 4675
    })
    test: Dataset({
        features: ['input', 'output', 'task'],
        num_rows: 4840
    })
})

In [35]:
def create_entailment_dataset(row):
    return {
        "sentence_A" : row["input"],
        "sentence_B" : row["output"],
        "label" : row["task"]
    }

In [36]:
william_2["hotel"]["train"][0]

{'input': 'kamar saya ada kendala di ac tidak berfungsi optimal . dan juga wifi koneksi kurang stabil .',
 'output': '<extra_id_0> ac ; <extra_id_1> wifi koneksi',
 'task': 'a'}

In [37]:
william_2["hotel"] = william_2["hotel"].map(create_entailment_dataset,remove_columns=["input","output","task"])

                                                                    

In [38]:
william_2["hotel"]["val"][-1]

{'sentence_A': 'pelayanan yang baik .',
 'sentence_B': '<extra_id_0> pelayanan <extra_id_1> baik <extra_id_2> positive',
 'label': 'aos'}

In [47]:
train_sample = william_2["hotel"]["train"].shuffle()
val_sample = william_2["hotel"]["val"].shuffle()
test_sample = william_2["hotel"]["test"].shuffle()

In [48]:
train_sample = train_sample.select(range(500))
val_sample = val_sample.select(range(300))
test_sample = test_sample.select(range(200))

In [49]:
train_sample.to_csv("hotel_entailment_train_500.tsv",sep="\t")
val_sample.to_csv("hotel_entailment_val_300.tsv",sep="\t")
test_sample.to_csv("hotel_entailment_test_200.tsv",sep="\t")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 122.38ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 192.42ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 241.14ba/s]


44528

In [19]:
william_2["hotel"]["train"].to_csv("hotel_entailment_train.tsv",sep="\t")
william_2["hotel"]["val"].to_csv("hotel_entailment_val.tsv",sep="\t")
william_2["hotel"]["test"].to_csv("hotel_entailment_test.tsv",sep="\t")

Creating CSV from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 72.22ba/s]
Creating CSV from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 130.33ba/s]
Creating CSV from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 106.47ba/s]


999337

In [20]:
william_2["hotel"]["train"]

Dataset({
    features: ['sentence_A', 'sentence_B', 'label'],
    num_rows: 14180
})