In [2]:
import torch
from datasets import load_dataset
from datasets import concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
checkpoint = "yangheng/deberta-v3-base-absa"
MAX_LEN = 128

## Data loading

In [3]:
data = pd.read_csv('Data/data_grouped.csv')
data.head()

Unnamed: 0,Pre_Text,aspects
0,05 bar melted chocolate cup,food
1,07 menu deal try hip tribeca restaurant wrong ...,food
2,1 1 small piece,price
3,1 hour water 1 hour drinks 1 hour food see alm...,price
4,1 small piece,price


In [5]:
# Split the concatenated aspect names into a list and flatten it
aspect_names_list = [aspect.split(', ') for aspect in data['aspects']]
aspect_names_flat = [aspect for sublist in aspect_names_list for aspect in sublist]

# Get unique aspect names
aspects = set(aspect_names_flat)
print("Unique aspect names:", aspects)
print("Number of unique aspect names:", len(aspects))

Unique aspect names: {'staff', 'location', 'seating', 'drinks', 'decor', 'food', 'parking', 'place', 'menu', 'price', 'dessert', 'ambience', 'service', 'clean', 'views'}
Number of unique aspect names: 15


In [15]:
# check average length of the text
data['Pre_Text'].apply(lambda x: len(x.split())).mean()

14.209853197312764

In [6]:
class2id = {class_:id for id, class_ in enumerate(aspects)}
id2class = {id:class_ for class_, id in class2id.items()}

In [7]:
print("class2id:", class2id)
print("id2class:", id2class)

class2id: {'staff': 0, 'location': 1, 'seating': 2, 'drinks': 3, 'decor': 4, 'food': 5, 'parking': 6, 'place': 7, 'menu': 8, 'price': 9, 'dessert': 10, 'ambience': 11, 'service': 12, 'clean': 13, 'views': 14}
id2class: {0: 'staff', 1: 'location', 2: 'seating', 3: 'drinks', 4: 'decor', 5: 'food', 6: 'parking', 7: 'place', 8: 'menu', 9: 'price', 10: 'dessert', 11: 'ambience', 12: 'service', 13: 'clean', 14: 'views'}


In [8]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [17]:
def preprocess_function(example):
   text = example['Pre_Text']
   all_labels = example['aspects'].split(', ')
   labels = [0. for i in range(len(aspects))]
   for label in all_labels:
       label_id = class2id[label]
       labels[label_id] = 1.
  
   example = tokenizer(text, truncation=True, max_length=MAX_LEN, padding='max_length')
   example['labels'] = labels
   return example

In [18]:
dataset = load_dataset('csv', data_files='Data/data_grouped.csv', split='train')
dataset = dataset.train_test_split(test_size=0.2)

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Pre_Text', 'aspects'],
        num_rows: 48228
    })
    test: Dataset({
        features: ['Pre_Text', 'aspects'],
        num_rows: 12057
    })
})

In [20]:
tokenized_dataset = dataset.map(preprocess_function)

Map: 100%|██████████| 48228/48228 [00:13<00:00, 3461.67 examples/s]
Map: 100%|██████████| 12057/12057 [00:03<00:00, 3076.32 examples/s]


In [21]:


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [22]:


clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):

   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))


In [23]:
model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=len(aspects),
        id2label=id2class, label2id=class2id,
        problem_type = "multi_label_classification")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at yangheng/deberta-v3-base-absa and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
print(model.config)

DebertaV2Config {
  "_name_or_path": "yangheng/deberta-v3-base-absa",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "staff",
    "1": "location",
    "2": "seating",
    "3": "drinks",
    "4": "decor",
    "5": "food",
    "6": "parking",
    "7": "place",
    "8": "menu",
    "9": "price",
    "10": "dessert",
    "11": "ambience",
    "12": "service",
    "13": "clean",
    "14": "views"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "ambience": 11,
    "clean": 13,
    "decor": 4,
    "dessert": 10,
    "drinks": 3,
    "food": 5,
    "location": 1,
    "menu": 8,
    "parking": 6,
    "place": 7,
    "price": 9,
    "seating": 2,
    "service": 12,
    "staff": 0,
    "views": 14
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention

In [None]:
batch_size = 2
epochs = 3
exp = "1"
step = int(len(tokenized_data["train"])/(batch_size*5)) # log after every 20% of the epoch

print(step)

training_args = TrainingArguments(
    output_dir="./trained/absa-base"+"_exp"+exp,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    save_total_limit=3,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=step,
    save_strategy="steps",
    save_steps=step,
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
    logging_dir="./logs/absa-base"+"_exp"+exp,
    logging_strategy = "steps",
    logging_steps = step
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_metrics,
)

In [None]:
trainer.train()

## Evaluation

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import evaluate
import numpy as np
import pandas as pd

In [None]:
path = "./trained/absa-base_exp3/checkpoint-49620"
model = AutoModelForSequenceClassification.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

In [None]:
from tqdm.notebook import tqdm
import evaluate
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
metric = evaluate.load("glue", "sst2", device=device)
# import precision_recall_fscore_support
def evaluate_test_set(model, tokenizer, test_dataset, batch_size=32, pbar = False):
    if torch.cuda.is_available():
        device = "cuda"
    else :
        device = "cpu"
    print(f"Device: {device}")
    model.to(device)
    model.eval()
    acc = 0
    f1 = 0
    recall = 0
    precision = 0

    predictions = []
    labels = []

    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
    if pbar:
        progress = tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    for idx, batch in enumerate(test_dataloader):
        inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        scores = outputs.logits
        probs = scores.softmax(dim=1)
        predicted_class_ids = probs.argmax(dim=1).tolist()

        predictions.extend(predicted_class_ids)
        batch_labels = batch["label"]
        labels.extend(batch_labels)

        # batch_accuracy = sum([1 if label == prediction else 0 for label, prediction in zip(batch_labels, predicted_class_ids)])
        # batch_accuracy = batch_accuracy / len(batch_labels)

        batch_metric = metric.compute(predictions=predicted_class_ids, references=batch_labels)
        batch_acc = batch_metric["accuracy"]
        batch_f1 = f1_score(batch_labels, predicted_class_ids, average='weighted', labels=np.unique(predicted_class_ids))
        batch_recall = recall_score(batch_labels, predicted_class_ids, average='weighted', labels=np.unique(predicted_class_ids))
        batch_precision = precision_score(batch_labels, predicted_class_ids, average='weighted', labels=np.unique(predicted_class_ids))
        # print(batch_accuracy, batch_f1, batch_recall, batch_precision)
    
        acc += batch_metric["accuracy"]
        f1 += batch_f1
        recall += batch_recall
        precision += batch_precision

        if pbar:
            progress.update(1)
            progress.set_description(f"Batch {idx+1}/{len(test_dataloader)}: {batch_acc*100:.2f}%")
        # print(f"Batch {idx+1}/{len(test_dataloader)}: {batch_accuracy}")


    acc = acc / len(test_dataloader)
    f1 = f1 / len(test_dataloader)
    recall = recall / len(test_dataloader)
    precision = precision / len(test_dataloader)
    return acc, f1, recall, precision, predictions, labels



In [None]:
accuracy, f1, recall, precision, predictions, labels =  evaluate_test_set(model, tokenizer, data["test"], batch_size=32, pbar=True)

In [None]:
print(f"Accuracy: {accuracy*100:.2f}%, F1: {f1*100:.2f}%, Recall: {recall*100:.2f}%, Precision: {precision*100:.2f}%")

In [None]:
from sklearn.metrics import classification_report
print(classification_report(labels, predictions, target_names=aspects, output_dict=True))