# FLAN-T5 rest14 ABSA (Aspect + Polarity)

In [1]:
#%pip install --upgrade pip

In [2]:
#%pip install transformers

In [3]:
#%pip install accelerate -U

In [4]:
#%pip install datasets

In [5]:
#%pip install git+https://github.com/dask/s3fs

In [6]:
#%pip install wandb

In [7]:
#%pip install torch

In [8]:
import pandas as pd
import os
import warnings
from datasets import load_dataset
import json
import random
from transformers import AutoTokenizer
from datasets import concatenate_datasets
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import wandb


warnings.filterwarnings("ignore")



In [9]:
# Fetch the "restaurants" configuration of the SemEval-2014 Task 4 dataset
# and display the resulting splits.
raw_datasets = load_dataset(
    path="alexcadillon/SemEval2014Task4",
    name="restaurants",
)
raw_datasets

DatasetDict({
    trial: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 100
    })
    train: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 800
    })
})

In [10]:
# Parsed prompt templates keyed by file path, so the JSON file is read and
# parsed once instead of once per dataset row.
_PROMPT_TEMPLATE_CACHE = {}


def _load_prompt_templates(template_path):
    """Return the parsed JSON at ``template_path``, reading the file at most once."""
    if template_path not in _PROMPT_TEMPLATE_CACHE:
        with open(template_path, encoding="utf-8") as fp:
            _PROMPT_TEMPLATE_CACHE[template_path] = json.load(fp)
    return _PROMPT_TEMPLATE_CACHE[template_path]


def preprocess_for_prompts(sample, template_path="prompts_absa.json"):
    """Add prompt and target text fields to a single dataset row.

    A random instruction is drawn from the template file and combined with the
    review text to form the model input; the aspect terms and their polarities
    are serialised into the expected model output.

    Args:
        sample: Mapping with a ``"text"`` string and an ``"aspectTerms"``
            iterable of mappings that each have ``"term"`` and ``"polarity"``.
        template_path: JSON file mapping the string keys ``"0"``..``"n-1"``
            to instruction strings. Defaults to ``prompts_absa.json`` in the
            working directory.

    Returns:
        The same ``sample`` object with three added keys:
        ``"aspect_polarities_list"`` (``"term: polarity"`` pairs joined by
        ``","``), ``"aspect_polarities_output"`` (the list prefixed with
        ``"Answer: \\n"``) and ``"aspect_polarity_input"`` (instruction and
        text, each followed by a newline).

    Raises:
        ValueError: If the template file contains no templates.
        KeyError: If the template file lacks the drawn ``"<index>"`` key.
    """
    templates = _load_prompt_templates(template_path)
    if not templates:
        raise ValueError(f"No prompt templates found in {template_path!r}")

    # The instruction is drawn with the global `random` state; seed it
    # beforehand if the prompt assignment has to be reproducible.
    num = random.randint(0, len(templates) - 1)
    instruction = templates[str(num)]

    pairs = [f"{item['term']}: {item['polarity']}" for item in sample["aspectTerms"]]
    sample["aspect_polarities_list"] = ",".join(pairs)
    sample["aspect_polarities_output"] = f"Answer: \n{sample['aspect_polarities_list']}"
    sample["aspect_polarity_input"] = f"{instruction}\n{sample['text']}\n"
    return sample

In [11]:
# Add the prompt/target text columns to every split, then look at one
# training row to sanity-check the result.
augmented_dataset = raw_datasets.map(function=preprocess_for_prompts)
augmented_dataset["train"][10]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'sentenceId': '296',
 'text': 'They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.',
 'aspectTerms': [{'term': 'toast',
   'polarity': 'negative',
   'from': '41',
   'to': '46'},
  {'term': 'mayonnaise', 'polarity': 'negative', 'from': '18', 'to': '28'},
  {'term': 'bacon', 'polarity': 'negative', 'from': '126', 'to': '131'},
  {'term': 'cheese', 'polarity': 'neutral', 'from': '73', 'to': '79'},
  {'term': 'ingredients', 'polarity': 'negative', 'from': '57', 'to': '68'},
  {'term': 'plate', 'polarity': 'neutral', 'from': '170', 'to': '175'},
  {'term': 'omelet', 'polarity': 'neutral', 'from': '86', 'to': '92'}],
 'aspectCategories': [{'category': 'food', 'polarity': 'negative'}],
 'aspect_polarities_list': 'toast: negative,mayonnaise: negative,bacon: negative,cheese: neutral,ingredients: negative,plate: neutral,omelet: neutral',
 'aspect_pol

In [12]:
# Hub id of the checkpoint; its tokenizer is loaded here.
model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [13]:
def _tokenize_text_column(dataset, column):
    """Tokenize one text column of ``dataset`` with the notebook-level ``tokenizer``.

    All original columns are dropped, so the result only holds the
    tokenizer's outputs (e.g. ``input_ids``).
    """
    return dataset.map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=dataset.column_names,
    )


# Lengths are measured over every split (train, test and trial) so the limits
# computed below cover all of them; the splits are concatenated only once.
all_splits = concatenate_datasets(
    [augmented_dataset["train"], augmented_dataset["test"], augmented_dataset["trial"]]
)

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = _tokenize_text_column(all_splits, "aspect_polarity_input")
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_targets = _tokenize_text_column(all_splits, "aspect_polarities_output")
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/3941 [00:00<?, ? examples/s]

Max source length: 105


Map:   0%|          | 0/3941 [00:00<?, ? examples/s]

Max target length: 88


In [14]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of prompts and targets into model-ready features.

    Args:
        sample: Batch with the text columns ``"aspect_polarity_input"`` and
            ``"aspect_polarities_output"``.
        padding: Padding strategy forwarded to the tokenizer for both the
            inputs and the targets.

    Returns:
        The tokenizer output for the inputs, with the target token ids added
        under ``"labels"``. Inputs are limited to ``max_source_length`` and
        targets to ``max_target_length`` tokens.
    """
    encoded = tokenizer(
        sample["aspect_polarity_input"],
        truncation=True,
        padding=padding,
        max_length=max_source_length,
    )

    # `text_target` makes the tokenizer treat the strings as targets.
    target_encoding = tokenizer(
        text_target=sample["aspect_polarities_output"],
        truncation=True,
        padding=padding,
        max_length=max_target_length,
    )
    label_ids = target_encoding["input_ids"]

    # With fixed-length padding, swap every pad token id in the labels for
    # -100 so that padded positions are ignored in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in row]
            for row in label_ids
        ]

    encoded["labels"] = label_ids
    return encoded


# Columns that are dropped once the rows have been tokenized, leaving only the
# features produced by `preprocess_function`.
_columns_to_drop = [
    "sentenceId",
    "text",
    "aspectTerms",
    "aspectCategories",
    "aspect_polarities_list",
    "aspect_polarities_output",
    "aspect_polarity_input",
]

# Tokenize every split in batches.
tokenized_dataset = augmented_dataset.map(
    preprocess_function,
    remove_columns=_columns_to_drop,
    batched=True,
)

_feature_names = list(tokenized_dataset["train"].features)
print(f"Keys of tokenized dataset: {_feature_names}")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/3041 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [15]:
# Hub id of the checkpoint to fine-tune.
model_id = "google/flan-t5-large"

# Fetch the pretrained sequence-to-sequence model for that id from the Hub.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

In [16]:
# Value used to pad label sequences; chosen so that padded label positions
# are ignored in the loss.
label_pad_token_id = -100

# Collator that batches the tokenized examples, padding to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [17]:
# `HfFolder` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# Store the Hugging Face access token taken from the `hugging_face_login`
# environment variable; a missing variable raises KeyError.
HfFolder.save_token(os.environ["hugging_face_login"])

In [18]:
# Weights & Biases settings are passed through environment variables.
# NOTE(review): the recorded training run below ended with "No space left on
# device"; confirm whether WANDB_LOG_MODEL="true" (model artifact logging)
# contributes to the disk usage.
os.environ.update(
    {
        "WANDB_PROJECT": "absa_research2",
        "WANDB_LOG_MODEL": "true",
    }
)

# Authenticate with the API key taken from the `wandb_login` environment variable.
wandb.login(key=os.environ["wandb_login"])

[34m[1mwandb[0m: Currently logged in as: [33mkatya_shakhova[0m ([33mshakhova[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc


True

In [19]:
# Hugging Face repository id: the checkpoint name (the part after "/") plus a
# task suffix. Also used as the local output directory.
repository_id = model_id.split("/")[1] + "-absa-rest"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    # optimisation
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.1,
    warmup_ratio=0.1,
    num_train_epochs=5,
    fp16=False,  # Overflows with fp16
    # batching & generation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    # logging & evaluation: both every 100 steps, reported to wandb
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    report_to="wandb",
    # checkpointing
    save_strategy="no",
    save_total_limit=1,
    # Hub settings
    push_to_hub=False,
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance: train on the "train" split, evaluate on "test".
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

In [20]:
# Run fine-tuning with the trainer configured above.
# NOTE(review): the recorded run of this cell stopped with
# `OSError: [Errno 28] No space left on device` after the step-900 log line;
# make sure enough disk space is available before re-running.
trainer.train()

[34m[1mwandb[0m: Tracking run with wandb version 0.16.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/jupyter/work/resources/wandb/run-20240313_132128-6frvlpzo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msunny-terrain-15[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/shakhova/absa_research2[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/shakhova/absa_research2/runs/6frvlpzo[0m


Step,Training Loss,Validation Loss
100,1.8172,0.252466
200,0.3471,0.1653
300,0.2487,0.157076
400,0.2064,0.145994
500,0.1673,0.142941
600,0.1569,0.139175
700,0.1291,0.151838
800,0.1268,0.146103
900,0.1091,0.144477


OSError: [Errno 28] No space left on device

In [None]:
# Write the tokenizer files into the local repository directory.
tokenizer.save_pretrained(
    repository_id,
)

# Generate the model card for this training run.
trainer.create_model_card()

# Upload the results to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Mark the Weights & Biases run as finished.
wandb.finish()