### Test model on SemEval 2016 dataset

In [8]:
from pyabsa import ABSAInstruction
import xml.etree.ElementTree as ET
import csv

# Parse the SemEval 2016 XML file (must be present in the working directory;
# ET.parse raises FileNotFoundError otherwise).
tree = ET.parse('semeval_16.xml')
root = tree.getroot()

# Build the pyabsa generator used for every prediction below
generator = ABSAInstruction.ABSAGenerator("multilingual")

# Open CSV file for writing results: one row per predicted quadruple
with open('absa_results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Sentence', 'Aspect', 'Opinion', 'Sentiment', 'Category'])

    # Iterate through each review and each sentence inside it
    for review in root.findall('./Review'):
        for sentence in review.findall('./sentences/sentence'):
            # findtext returns None when <text> is absent and '' when it is
            # empty, so `or ''` avoids calling .strip() on None.
            sentence_text = (sentence.findtext('text') or '').strip()
            if not sentence_text:
                # Nothing to analyse for a missing/blank sentence
                continue

            print(sentence_text)

            # Perform ABSA
            result = generator.predict(sentence_text)

            # Write ABSA results to CSV
            for quadruple in result['Quadruples']:
                writer.writerow([
                    sentence_text,
                    quadruple['aspect'],
                    quadruple['opinion'],
                    quadruple['polarity'],
                    quadruple['category'],
                ])

FileNotFoundError: [Errno 2] No such file or directory: 'semeval_16.xml'

### Get unique set of aspect categories

In [8]:
import csv

# Collect the distinct values of the 'Category' column from the results file
with open('absa_results.csv', 'r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    unique_categories = {row['Category'] for row in reader}

# Write unique categories to CSV.
# Set iteration order for strings is not stable between interpreter runs, so
# the values are sorted to make the output file reproducible. key=str keeps the
# sort from failing if a short row produced a None category.
with open('unique_categories.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category'])
    for category in sorted(unique_categories, key=str):
        writer.writerow([category])

### Carry out testing on ChatGPT-generated sentences

In [13]:
import csv

# Function to append ABSA output to an existing CSV file
def append_to_csv(output_list, csv_filename):
    """Append ABSA result rows to a CSV file.

    Args:
        output_list: Iterable of dicts keyed by 'Sentence', 'Aspect',
            'Opinion', 'Sentiment' and 'Category'.
        csv_filename: Path of the CSV file; it is created if it does not exist.

    The header row is written first when the file is new or empty, so the
    result can always be read back with ``csv.DictReader``. Existing content
    is never rewritten.

    Raises:
        ValueError: If a row contains a key that is not one of the field names
            (raised by ``csv.DictWriter``).
    """
    fieldnames = ['Sentence', 'Aspect', 'Opinion', 'Sentiment', 'Category']
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # In append mode the stream is positioned at end-of-file, so a
        # position of 0 means there is no content (and no header) yet.
        if csvfile.tell() == 0:
            writer.writeheader()

        # Write each row of output
        writer.writerows(output_list)

# Load sample sentences from the text file, one sentence per line.
# Blank lines are skipped so the model is never asked to predict on ''.
with open('sample_data.txt', 'r', encoding='utf-8') as file:
    sample_sentences = [line.strip() for line in file if line.strip()]

# All rows produced in this cell, kept in memory for later inspection
absa_output = []

# Iterate through sample sentences, perform predictions, and append results to CSV
for sentence in sample_sentences:
    # Perform ABSA prediction
    result = generator.predict(sentence)

    # One CSV row per predicted quadruple for this sentence
    sentence_rows = [
        {
            'Sentence': sentence,
            'Aspect': quadruple['aspect'],
            'Opinion': quadruple['opinion'],
            'Sentiment': quadruple['polarity'],
            'Category': quadruple['category'],
        }
        for quadruple in result['Quadruples']
    ]
    absa_output.extend(sentence_rows)

    # Append only this sentence's rows. Appending the cumulative list on every
    # iteration would write all earlier rows to the file again each time.
    append_to_csv(sentence_rows, 'absa_results.csv')

{'text': 'The food at that restaurant was delicious, especially the pasta.', 'Quadruples': [{'aspect': 'food', 'polarity': 'positive', 'opinion': 'delicious', 'category': 'FOOD#QUALITY'}, {'aspect': 'pasta', 'polarity': 'positive', 'opinion': 'delicious', 'category': 'FOOD#QUALITY'}]}
{'text': 'The customer service was exceptional; the staff was very helpful and attentive.', 'Quadruples': [{'aspect': 'customer service', 'polarity': 'positive', 'opinion': 'exceptional', 'category': 'SUPPORT#GENERAL'}, {'aspect': 'staff', 'polarity': 'positive', 'opinion': 'helpful', 'category': 'SERVICE#GENERAL'}]}
{'text': 'I loved the ambiance of the cafe; it had a cozy atmosphere with soothing music.', 'Quadruples': [{'aspect': 'ambiance', 'polarity': 'positive', 'opinion': 'loved', 'category': 'AMBIENCE#GENERAL'}, {'aspect': 'atmosphere', 'polarity': 'positive', 'opinion': 'cozy', 'category': 'AMBIENCE#GENERAL'}, {'aspect': 'music', 'polarity': 'positive', 'opinion': 'soothing', 'category': 'AMBIENC

### Train model using own data

In [14]:
# -*- coding: utf-8 -*-
# file: train.py
# time: 11:30 2023/3/13
# author: YANG, HENG <hy345@exeter.ac.uk> (杨恒)
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2019-2023. All Rights Reserved.
import os
import warnings

import findfile
from pyabsa import ABSAInstruction as absa_instruction

warnings.filterwarnings("ignore")
import pandas as pd


# Experiment identifiers; both are used to build the checkpoint output directory
task_name = "multitask"
experiment_name = "instruction"

# Base checkpoint to fine-tune. Alternative checkpoints:
#   allenai/tk-instruct-base-def-pos
#   allenai/tk-instruct-large-def-pos
#   allenai/tk-instruct-3b-def-pos
#   google/mt5-base
model_checkpoint = "kevinscaria/ate_tk-instruct-base-def-pos-neg-neut-combined"

print("Experiment Name: ", experiment_name)

# Output directory: checkpoints/<task>/<checkpoint with '/' removed>-<experiment>
run_dir_name = f"{model_checkpoint.replace('/', '')}-{experiment_name}"
model_out_path = os.path.join("checkpoints", task_name, run_dir_name)
print("Model output path: ", model_out_path)

# Dataset location; the train and test splits are read from the same directory.
# Alternative dataset roots:
#   ./integrated_datasets
#   ./integrated_datasets/acos_datasets
#   ./integrated_datasets/acos_datasets/501.Laptop14
id_train_file_path = id_test_file_path = './integrated_datasets/acos_datasets/504.Restaurant16'

# Read each split and wrap it in a DataFrame
id_tr_df = pd.DataFrame(
    absa_instruction.data_utils.read_json(id_train_file_path, "train")
)
id_te_df = pd.DataFrame(
    absa_instruction.data_utils.read_json(id_test_file_path, "test")
)

loader = absa_instruction.data_utils.InstructDatasetLoader(id_tr_df, id_te_df)

# Run the instruction preparation step on every split the loader actually holds
for split_attr in ("train_df_id", "test_df_id", "train_df_ood", "test_df_ood"):
    split_df = getattr(loader, split_attr)
    if split_df is not None:
        setattr(loader, split_attr, loader.prepare_instruction_dataloader(split_df))

# Create T5 utils object
t5_exp = absa_instruction.model.T5Generator(model_checkpoint)

# Tokenize the datasets with the generator's input tokenizer function
id_ds, id_tokenized_ds, ood_ds, ood_tokenzed_ds = loader.create_datasets(
    t5_exp.tokenize_function_inputs
)

# Keyword arguments forwarded to t5_exp.train(...)
training_args = dict(
    output_dir=model_out_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    push_to_hub=False,
    eval_accumulation_steps=1,
    predict_with_generate=True,
    logging_steps=1000000000,
    use_mps_device=False,
    fp16=False,  # set to True to enable fp16
)

Experiment Name:  instruction
Model output path:  checkpoints\multitask\kevinscariaate_tk-instruct-base-def-pos-neg-neut-combined-instruction
./integrated_datasets/acos_datasets/504.Restaurant16\rest16_quad_train.tsv.jsonl
./integrated_datasets/acos_datasets/504.Restaurant16\rest16_quad_test.tsv.jsonl
[2024-04-10 12:23:29] (2.4.1.post1) ********** [32mAvailable ACOS model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-04-10 12:23:29] (2.4.1.post1) ********** [32mAvailable ACOS model checkpoints for Version:2.4.1.post1 (this version)[0m **********
[2024-04-10 12:23:29] (2.4.1.post1) [31mCheckpoint:kevinscaria/ate_tk-instruct-base-def-pos-neg-neut-combined is not found, you can raise an issue for requesting shares of checkpoints[0m
[2024-04-10 12:23:29] (2.4.1.post1) No checkpoint found in Model Hub for task: kevinscaria/ate_tk-instruct-base-def-pos-neg-neut-combined


Map:   0%|          | 0/5601 [00:00<?, ? examples/s]

Map:   0%|          | 0/2135 [00:00<?, ? examples/s]

In [18]:
# Display the training DataFrame; `id_tr_df` is created in the training-setup cell,
# so that cell must have been run first.
id_tr_df

Unnamed: 0,text,labels
0,judging from previous posts this used to be a ...,"[{'aspect': 'place', 'opinion': 'not any longe..."
1,"we , there were four of us , arrived at noon -...","[{'aspect': 'staff', 'opinion': 'rude', 'polar..."
2,"they never brought us complimentary noodles , ...","[{'aspect': 'NULL', 'opinion': 'NULL', 'polari..."
3,the food was lousy - too sweet or too salty an...,"[{'aspect': 'food', 'opinion': 'lousy', 'polar..."
4,"after all that , they complained to me about t...","[{'aspect': 'NULL', 'opinion': 'complained', '..."
...,...,...
1525,"i ca n ' t believe that it was , but please pu...","[{'aspect': 'NULL', 'opinion': 'NULL', 'polari..."
1526,the waitress came to check in on us every few ...,"[{'aspect': 'waitress', 'opinion': 'NULL', 'po..."
1527,i could n ' t ignore the fact that she reach o...,"[{'aspect': 'NULL', 'opinion': 'NULL', 'polari..."
1528,she then put the check down without asking if ...,"[{'aspect': 'NULL', 'opinion': 'NULL', 'polari..."


In [3]:
# Fine-tune the model on the tokenized in-domain dataset
model_trainer = t5_exp.train(id_tokenized_ds, **training_args)

# Arguments shared by the train-split and test-split prediction passes
label_kwargs = dict(
    predictor=model_trainer,
    tokenized_dataset=id_tokenized_ds,
    batch_size=16,
)

# Predicted and gold labels - Training set
id_tr_pred_labels = t5_exp.get_labels(sample_set="train", **label_kwargs)
id_tr_labels = [label.strip() for label in id_ds["train"]["labels"]]

# Predicted and gold labels - Testing set
id_te_pred_labels = t5_exp.get_labels(sample_set="test", **label_kwargs)
id_te_labels = [label.strip() for label in id_ds["test"]["labels"]]

# Compute and print the classic metrics for each split, training set first.
# (t5_exp.get_metrics takes the same two arguments and was the earlier,
# now-disabled alternative.)
metric_reports = (
    (
        "----------------------- Classic Training Set Metrics -----------------------",
        id_tr_labels,
        id_tr_pred_labels,
    ),
    (
        "----------------------- Classic Testing Set Metrics -----------------------",
        id_te_labels,
        id_te_pred_labels,
    ),
)
for banner, gold_labels, pred_labels in metric_reports:
    metrics = t5_exp.get_classic_metrics(gold_labels, pred_labels)
    print(banner)
    print(metrics)

Trainer device: cuda:0

Model training started ....


  0%|          | 0/8406 [00:00<?, ?it/s]

  0%|          | 0/134 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2614549398422241, 'eval_runtime': 21.7323, 'eval_samples_per_second': 98.241, 'eval_steps_per_second': 6.166, 'epoch': 1.0}


  0%|          | 0/134 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2135722041130066, 'eval_runtime': 22.0342, 'eval_samples_per_second': 96.895, 'eval_steps_per_second': 6.081, 'epoch': 2.0}


  0%|          | 0/134 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.20946869254112244, 'eval_runtime': 21.2735, 'eval_samples_per_second': 100.36, 'eval_steps_per_second': 6.299, 'epoch': 3.0}


  0%|          | 0/134 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.20837971568107605, 'eval_runtime': 21.1635, 'eval_samples_per_second': 100.881, 'eval_steps_per_second': 6.332, 'epoch': 4.0}


  0%|          | 0/134 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2043803483247757, 'eval_runtime': 21.1642, 'eval_samples_per_second': 100.878, 'eval_steps_per_second': 6.331, 'epoch': 5.0}


  0%|          | 0/134 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.21997113525867462, 'eval_runtime': 21.0217, 'eval_samples_per_second': 101.562, 'eval_steps_per_second': 6.374, 'epoch': 6.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
Non-default generation parameters: {'max_length': 128}


{'train_runtime': 1725.1374, 'train_samples_per_second': 19.48, 'train_steps_per_second': 4.873, 'train_loss': 0.24677734955872294, 'epoch': 6.0}
Prediction from trainer


  0%|          | 0/351 [00:00<?, ?it/s]

Prediction from trainer


  0%|          | 0/134 [00:00<?, ?it/s]

NULL
NULL
NULL:negative
NULL:negative
NULL:overpriced
NULL:overpriced
NULL
NULL
NULL:positive
NULL:positive
NULL:conveniently
NULL:conveniently
drinkmenu
drinkmenu
drinkmenu:positive
drinkmenu:positive
drinkmenu:love
drinkmenu:love
drinkmenu:DRINKS#STYLE_OPTIONS
drinkmenu:DRINKS#STYLE_OPTIONS
food|decor
food|decor
food:positive|decor:neutral
food:positive|decor:negative
food:great|food:great|decor:NULL
food:great|decor:NULL
food:FOOD#QUALITY|food:FOOD#PRICES|decor:AMBIENCE#GENERAL
food:FOOD#QUALITY|decor:AMBIENCE#GENERAL
dessert|specialroll|regularroll
specialroll|regularroll
dessert:positive|specialroll:positive|regularroll:positive
dessert:positive|specialroll:positive|regularroll:positive
dessert:saveroom|specialroll:enough|regularroll:enough
dessert:NULL|specialroll:NULL|regularroll:NULL
dessert:FOOD#QUALITY|specialroll:FOOD#STYLE_OPTIONS|regularroll:FOOD#STYLE_OPTIONS
dessert:FOOD#QUALITY|specialroll:FOOD#STYLE_OPTIONS|regularroll:FOOD#STYLE_OPTIONS
NULL
NULL
NULL:negative
NULL:ne

In [19]:
from pyabsa import ABSAInstruction

if __name__ == "__main__":
    import os

    # Build the checkpoint path with os.path.join instead of a literal with
    # backslashes: "\m" and "\k" are invalid escape sequences in a normal
    # string, and hard-coded backslash separators only work on Windows.
    checkpoint_path = os.path.join(
        "checkpoints",
        "multitask",
        "kevinscariaate_tk-instruct-base-def-pos-neg-neut-combined-instruction",
    )
    generator = ABSAInstruction.ABSAGenerator(checkpoint_path)

    # Sentences used to sanity-check the fine-tuned model
    examples = [
        "The food is good, but the service is bad.",
        "The chicken rice was cold but the noodles were great.",
        "Parking was hard to find",
        "The location of the restaurant is convenient.",
        "yum!"
    ]

    # Use a distinct loop variable so the list is not shadowed by its items
    for example in examples:
        result = generator.predict(example)
        print(result)

{'text': 'The food is good, but the service is bad.', 'Quadruples': [{'aspect': 'food', 'polarity': 'positive', 'opinion': 'good', 'category': 'FOOD#QUALITY'}, {'aspect': 'service', 'polarity': 'negative', 'opinion': 'bad', 'category': 'SERVICE#GENERAL'}]}
{'text': 'The food is good, but the service is bad.', 'Quadruples': [{'aspect': 'food', 'polarity': 'positive', 'opinion': 'good', 'category': 'FOOD#QUALITY'}, {'aspect': 'service', 'polarity': 'negative', 'opinion': 'bad', 'category': 'SERVICE#GENERAL'}]}
{'text': 'The chicken rice was cold but the noodles were great.', 'Quadruples': [{'aspect': 'chicken rice', 'polarity': 'negative', 'opinion': 'cold', 'category': 'FOOD#QUALITY'}, {'aspect': 'noodles', 'polarity': 'positive', 'opinion': 'great', 'category': 'FOOD#QUALITY'}]}
{'text': 'The chicken rice was cold but the noodles were great.', 'Quadruples': [{'aspect': 'chicken rice', 'polarity': 'negative', 'opinion': 'cold', 'category': 'FOOD#QUALITY'}, {'aspect': 'noodles', 'polarit