### Get unique set of aspect categories

In [None]:
import csv

# Collect the distinct values of the 'Category' column from the ABSA results.
with open('absa_results.csv', 'r', newline='', encoding='utf-8') as csvfile:
    unique_categories = {row['Category'] for row in csv.DictReader(csvfile)}

# Write one category per row under a 'Category' header. The categories are
# sorted because set iteration order is arbitrary; sorting keeps the output
# file identical from run to run. csv.DictReader yields None for a column that
# is missing from a short row, so such a value is ordered as an empty string.
with open('unique_categories.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category'])
    writer.writerows(
        [category]
        for category in sorted(unique_categories, key=lambda value: value or '')
    )

### Carry out testing on ChatGPT-generated sentences

In [None]:
import csv

# Function to append ABSA output to an existing CSV file
def append_to_csv(output_list, csv_filename):
    """Append ABSA result rows to a CSV file, adding a header to an empty file.

    Args:
        output_list: Iterable of dicts keyed by 'Sentence', 'Aspect',
            'Opinion', 'Sentiment' and 'Category'.
        csv_filename: Path of the CSV file to append to. It is created if it
            does not exist.

    Raises:
        ValueError: If a row contains a key that is not one of the five
            columns (raised by ``csv.DictWriter``).
    """
    fieldnames = ['Sentence', 'Aspect', 'Opinion', 'Sentiment', 'Category']
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # A new or empty file has no header yet. Without one, csv.DictReader
        # would consume the first data row as the column names.
        if csvfile.tell() == 0:
            writer.writeheader()

        writer.writerows(output_list)

# Load sample sentences from the text file, one per line. Blank lines are
# skipped so the model is never asked to predict on an empty string.
with open('sample_data.txt', 'r', encoding='utf-8') as file:
    sample_sentences = [line.strip() for line in file if line.strip()]

# All rows produced in this run, kept in memory for later inspection.
absa_output = []

# Predict each sentence and append its quadruples to the results CSV.
# `generator` must already be loaded (it is created in a separate cell).
for sentence in sample_sentences:
    result = generator.predict(sentence)

    # One CSV row per predicted quadruple of this sentence.
    sentence_rows = [
        {
            'Sentence': sentence,
            'Aspect': quadruple['aspect'],
            'Opinion': quadruple['opinion'],
            'Sentiment': quadruple['polarity'],
            'Category': quadruple['category'],
        }
        for quadruple in result['Quadruples']
    ]
    absa_output.extend(sentence_rows)

    # Write only this sentence's rows. Passing the cumulative absa_output here
    # would re-append every earlier sentence's rows on each iteration.
    append_to_csv(sentence_rows, 'absa_results.csv')

{'text': 'The food at that restaurant was delicious, especially the pasta.', 'Quadruples': [{'aspect': 'food', 'polarity': 'positive', 'opinion': 'delicious', 'category': 'FOOD#QUALITY'}, {'aspect': 'pasta', 'polarity': 'positive', 'opinion': 'delicious', 'category': 'FOOD#QUALITY'}]}
{'text': 'The customer service was exceptional; the staff was very helpful and attentive.', 'Quadruples': [{'aspect': 'customer service', 'polarity': 'positive', 'opinion': 'exceptional', 'category': 'SUPPORT#GENERAL'}, {'aspect': 'staff', 'polarity': 'positive', 'opinion': 'helpful', 'category': 'SERVICE#GENERAL'}]}
{'text': 'I loved the ambiance of the cafe; it had a cozy atmosphere with soothing music.', 'Quadruples': [{'aspect': 'ambiance', 'polarity': 'positive', 'opinion': 'loved', 'category': 'AMBIENCE#GENERAL'}, {'aspect': 'atmosphere', 'polarity': 'positive', 'opinion': 'cozy', 'category': 'AMBIENCE#GENERAL'}, {'aspect': 'music', 'polarity': 'positive', 'opinion': 'soothing', 'category': 'AMBIENC

### Train model using own data

In [4]:
# -*- coding: utf-8 -*-
# file: train.py
# time: 11:30 2023/3/13
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2023. All Rights Reserved.
import os
import warnings

import findfile
from pyabsa import ABSAInstruction as absa_instruction

warnings.filterwarnings("ignore")
import pandas as pd


task_name = "multitask"
experiment_name = "instruction"
# model_checkpoint = 'allenai/tk-instruct-base-def-pos'
# model_checkpoint = "kevinscaria/ate_tk-instruct-base-def-pos-neg-neut-combined"
# model_checkpoint = 'allenai/tk-instruct-large-def-pos'
# model_checkpoint = 'allenai/tk-instruct-3b-def-pos'
# model_checkpoint = 'google/mt5-base'
# Raw string: this Windows-style path contains "\m" and "\k", which are not
# valid escape sequences in an ordinary string literal. The runtime value is
# unchanged.
model_checkpoint = r'checkpoints\multitask\kevinscariaate_tk-instruct-base-def-pos-neg-neut-combined-instruction'

print("Experiment Name: ", experiment_name)
# Output directory: checkpoints/<task_name>/<checkpoint name>-<experiment_name>.
# NOTE: replace('/', '') only removes forward slashes. With the local
# backslash path above, the whole path (including its own
# "checkpoints\multitask\" prefix) is kept, so the printed output path repeats
# that prefix.
model_out_path = os.path.join(
    "checkpoints", task_name, f"{model_checkpoint.replace('/', '')}-{experiment_name}"
)
print("Model output path: ", model_out_path)

# Dataset locations (alternatives kept for reference)
# id_train_file_path = './integrated_datasets'
# id_test_file_path = './integrated_datasets'
# id_train_file_path = "./integrated_datasets/acos_datasets/"
# id_test_file_path = "./integrated_datasets/acos_datasets"
# id_train_file_path = "./integrated_datasets/acos_datasets/501.Laptop14"
# id_test_file_path = "./integrated_datasets/acos_datasets/501.Laptop14"
# id_train_file_path = './integrated_datasets/acos_datasets/504.Restaurant16'
# id_test_file_path = './integrated_datasets/acos_datasets/504.Restaurant16'
id_train_file_path = './data_annotation/combined'
id_test_file_path = './data_annotation/combined'

# Read the "train" and "test" splits and wrap each one in a DataFrame.
id_tr_df = pd.DataFrame(
    absa_instruction.data_utils.read_json(id_train_file_path, "train")
)
id_te_df = pd.DataFrame(
    absa_instruction.data_utils.read_json(id_test_file_path, "test")
)

loader = absa_instruction.data_utils.InstructDatasetLoader(id_tr_df, id_te_df)

# Pass every split the loader holds (i.e. that is not None) through
# prepare_instruction_dataloader and store the result back on the loader.
for _split in ("train_df_id", "test_df_id", "train_df_ood", "test_df_ood"):
    _frame = getattr(loader, _split)
    if _frame is not None:
        setattr(loader, _split, loader.prepare_instruction_dataloader(_frame))

# Build the T5 generator wrapper from the selected checkpoint
t5_exp = absa_instruction.model.T5Generator(model_checkpoint)

# Create the raw and tokenized datasets using the generator's input tokenizer
id_ds, id_tokenized_ds, ood_ds, ood_tokenzed_ds = loader.create_datasets(
    t5_exp.tokenize_function_inputs
)

# Keyword arguments that the training cell unpacks into t5_exp.train(...)
training_args = dict(
    output_dir=model_out_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    push_to_hub=False,
    eval_accumulation_steps=1,
    predict_with_generate=True,
    logging_steps=1000000000,
    use_mps_device=False,
    fp16=False,
)

Experiment Name:  instruction
Model output path:  checkpoints\multitask\checkpoints\multitask\kevinscariaate_tk-instruct-base-def-pos-neg-neut-combined-instruction-instruction
./data_annotation/combined\combined_train.jsonl
./data_annotation/combined\combined_test.jsonl


Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

In [None]:
# Debugging aid: walk the training rows in a reproducible shuffled order,
# print every label's aspect, and stop at the first row whose labels cannot
# be read.
df = id_tr_df.sample(frac=1, random_state=1999)
for i, data in df.iterrows():
    try:
        for label in data["labels"]:
            print(label["aspect"])
    except Exception:
        # Not a bare `except:`, so Ctrl-C / kernel interrupts still propagate
        # instead of being reported as a malformed row.
        print("here", i)
        print(data["text"])
        print(data["labels"])
        print("HELP")
        break
    _aspects = [label["aspect"] for label in data["labels"]]

In [5]:
# Train model
# Runs training on id_tokenized_ds, forwarding every entry of training_args as
# a keyword argument. The returned object is kept because the commented-out
# evaluation code below passes it to get_labels as `predictor`.
model_trainer = t5_exp.train(id_tokenized_ds, **training_args)


# Get prediction labels - Training set
# id_tr_pred_labels = t5_exp.get_labels(
#     predictor=model_trainer,
#     tokenized_dataset=id_tokenized_ds,
#     sample_set="train",
#     batch_size=16,
# )
# id_tr_labels = [i.strip() for i in id_ds["train"]["labels"]]

# # Get prediction labels - Testing set
# id_te_pred_labels = t5_exp.get_labels(
#     predictor=model_trainer,
#     tokenized_dataset=id_tokenized_ds,
#     sample_set="test",
#     batch_size=16,
# )
# id_te_labels = [i.strip() for i in id_ds["test"]["labels"]]

# # Compute Metrics
# metrics = t5_exp.get_metrics(id_tr_labels, id_tr_pred_labels)
# print('----------------------- Training Set Metrics -----------------------')
# print(metrics)
#
# metrics = t5_exp.get_metrics(id_te_labels, id_te_pred_labels)
# print('----------------------- Testing Set Metrics -----------------------')
# print(metrics)

# Compute Metrics
# metrics = t5_exp.get_classic_metrics(id_tr_labels, id_tr_pred_labels)
# print("----------------------- Classic Training Set Metrics -----------------------")
# print(metrics)

# print("id_tr_labels", id_tr_labels)
# print("id_tr_pred_labels", id_tr_pred_labels)

# metrics = t5_exp.get_classic_metrics(id_te_labels, id_te_pred_labels)
# print("----------------------- Classic Testing Set Metrics -----------------------")
# print(metrics)

Trainer device: cuda:0

Model training started ....


  0%|          | 0/3480 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.45295706391334534, 'eval_runtime': 40.776, 'eval_samples_per_second': 8.534, 'eval_steps_per_second': 0.54, 'epoch': 1.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.38168343901634216, 'eval_runtime': 64.7715, 'eval_samples_per_second': 5.373, 'eval_steps_per_second': 0.34, 'epoch': 2.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3744215667247772, 'eval_runtime': 66.1658, 'eval_samples_per_second': 5.26, 'eval_steps_per_second': 0.332, 'epoch': 3.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3746013939380646, 'eval_runtime': 66.1374, 'eval_samples_per_second': 5.262, 'eval_steps_per_second': 0.333, 'epoch': 4.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3795979619026184, 'eval_runtime': 66.1407, 'eval_samples_per_second': 5.262, 'eval_steps_per_second': 0.333, 'epoch': 5.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.36377352476119995, 'eval_runtime': 66.076, 'eval_samples_per_second': 5.267, 'eval_steps_per_second': 0.333, 'epoch': 6.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.40426871180534363, 'eval_runtime': 66.0469, 'eval_samples_per_second': 5.269, 'eval_steps_per_second': 0.333, 'epoch': 7.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4194203317165375, 'eval_runtime': 66.2233, 'eval_samples_per_second': 5.255, 'eval_steps_per_second': 0.332, 'epoch': 8.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4108732342720032, 'eval_runtime': 66.0472, 'eval_samples_per_second': 5.269, 'eval_steps_per_second': 0.333, 'epoch': 9.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.41215622425079346, 'eval_runtime': 65.9973, 'eval_samples_per_second': 5.273, 'eval_steps_per_second': 0.333, 'epoch': 10.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4261099696159363, 'eval_runtime': 66.0922, 'eval_samples_per_second': 5.265, 'eval_steps_per_second': 0.333, 'epoch': 11.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4353024661540985, 'eval_runtime': 66.0784, 'eval_samples_per_second': 5.266, 'eval_steps_per_second': 0.333, 'epoch': 12.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4463439881801605, 'eval_runtime': 66.1276, 'eval_samples_per_second': 5.263, 'eval_steps_per_second': 0.333, 'epoch': 13.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.46307680010795593, 'eval_runtime': 66.0495, 'eval_samples_per_second': 5.269, 'eval_steps_per_second': 0.333, 'epoch': 14.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4721071422100067, 'eval_runtime': 65.9527, 'eval_samples_per_second': 5.277, 'eval_steps_per_second': 0.334, 'epoch': 15.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.47889775037765503, 'eval_runtime': 66.2186, 'eval_samples_per_second': 5.255, 'eval_steps_per_second': 0.332, 'epoch': 16.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.49793052673339844, 'eval_runtime': 66.0807, 'eval_samples_per_second': 5.266, 'eval_steps_per_second': 0.333, 'epoch': 17.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.49289318919181824, 'eval_runtime': 66.1526, 'eval_samples_per_second': 5.261, 'eval_steps_per_second': 0.333, 'epoch': 18.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4970405697822571, 'eval_runtime': 66.0538, 'eval_samples_per_second': 5.268, 'eval_steps_per_second': 0.333, 'epoch': 19.0}


  0%|          | 0/22 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4987224340438843, 'eval_runtime': 66.1511, 'eval_samples_per_second': 5.261, 'eval_steps_per_second': 0.333, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
Non-default generation parameters: {'max_length': 128}


{'train_runtime': 8714.5517, 'train_samples_per_second': 1.597, 'train_steps_per_second': 0.399, 'train_loss': 0.21618085839282508, 'epoch': 20.0}


In [None]:
from pyabsa import ABSAInstruction

if __name__ == "__main__":
    # Raw string: the Windows-style path contains "\m" and "\k", which are not
    # valid escape sequences in an ordinary string literal.
    generator = ABSAInstruction.ABSAGenerator(
        r"checkpoints\multitask\kevinscariaate_tk-instruct-base-def-pos-neg-neut-combined-instruction"
    )
    examples = [
        "The food is good, but the service is bad.",
        "The chicken rice was cold but the noodles were great.",
        "Parking was hard to find",
        "The location of the restaurant is convenient.",
        "yum!",
    ]

    # Predict and print the result for each sentence. The list is named
    # `examples` so the loop variable does not overwrite it.
    for example in examples:
        result = generator.predict(example)
        print(result)

{'text': 'The food is good, but the service is bad.', 'Quadruples': [{'aspect': 'food', 'polarity': 'positive', 'opinion': 'good', 'category': 'FOOD#QUALITY'}, {'aspect': 'service', 'polarity': 'negative', 'opinion': 'bad', 'category': 'SERVICE#GENERAL'}]}
{'text': 'The food is good, but the service is bad.', 'Quadruples': [{'aspect': 'food', 'polarity': 'positive', 'opinion': 'good', 'category': 'FOOD#QUALITY'}, {'aspect': 'service', 'polarity': 'negative', 'opinion': 'bad', 'category': 'SERVICE#GENERAL'}]}
{'text': 'The chicken rice was cold but the noodles were great.', 'Quadruples': [{'aspect': 'chicken rice', 'polarity': 'negative', 'opinion': 'cold', 'category': 'FOOD#QUALITY'}, {'aspect': 'noodles', 'polarity': 'positive', 'opinion': 'great', 'category': 'FOOD#QUALITY'}]}
{'text': 'The chicken rice was cold but the noodles were great.', 'Quadruples': [{'aspect': 'chicken rice', 'polarity': 'negative', 'opinion': 'cold', 'category': 'FOOD#QUALITY'}, {'aspect': 'noodles', 'polarit

### Carry out micro and macro testing

In [8]:
import json
from pyabsa import ABSAInstruction

# Function to read data from JSONL file
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON document per
            line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot be relied on to decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (such as an extra newline at the end of the file),
        # which json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]

# Function to write data to JSONL file
def write_jsonl(data, file_path):
    """Serialize each item of *data* as one JSON line in *file_path*.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, 'w') as out:
        out.writelines(f"{json.dumps(record)}\n" for record in data)

# Function to predict using the model
def predict_texts(texts, model):
    """Return ``model.predict(...)`` for the 'text' field of every record.

    Args:
        texts: Iterable of mappings that each have a 'text' key.
        model: Object exposing a ``predict(text)`` method.

    Returns:
        A list of the model's predictions, in input order.
    """
    return [model.predict(record['text']) for record in texts]

# Paths to input JSONL files
input_files = ["combined_test.jsonl"]
# Paths to the output JSONL files (output_files[i] pairs with input_files[i])
output_files = ["metric_test_set.jsonl"]

# Load the model. Raw string: the Windows-style path contains "\m" and "\k",
# which are not valid escape sequences in an ordinary string literal.
generator = ABSAInstruction.ABSAGenerator(
    r"checkpoints\multitask\kevinscariaate_tk-instruct-base-def-pos-neg-neut-combined-instruction"
)

for i, input_file in enumerate(input_files):
    # Read data from input file
    input_data = read_jsonl(input_file)

    # Predict using the model
    predicted_data = predict_texts(input_data, generator)

    # Write predicted data to the matching output file
    write_jsonl(predicted_data, output_files[i])

{'text': 'Environment is nice  , same goes to the food. The only part can be improved us the elevated floor on the higher ground , as the set up my encourage kids to climb up and down. This can be unsafe for kids .', 'Quadruples': [{'aspect': 'Environment', 'polarity': 'positive', 'opinion': 'nice', 'category': 'AMBIENCE#GENERAL'}, {'aspect': 'food', 'polarity': 'negative', 'opinion': 'unsafe', 'category': 'FOOD#QUALITY'}, {'aspect': 'set up', 'polarity': 'negative', 'opinion': 'unsafe', 'category': 'AMBIENCE#GENERAL'}]}
{'text': 'After being down in Melaka for a while, then coming to KL and having a few disappointing meals it was heaven to have some familiar Melaka style (Muar is just near it) food.  Make sure you have the cendol for dessert- so good!', 'Quadruples': [{'aspect': 'cendol', 'polarity': 'positive', 'opinion': 'good', 'category': 'FOOD#QUALITY'}, {'aspect': 'NULL', 'polarity': 'positive', 'opinion': 'NULL', 'category': 'FOOD#QUALITY'}]}
{'text': 'Enjoyed our dinner here. 