# Задача нахождения категории аспекта и его тональности как многомерная классфикация предложения

В этом ноутбуке я рассмотрю нахождение категории авспекта/аспектов и их тональности через задачу Seq2Seq классификации. Так у одного предложения может быть несколько категорий, но лейблы будет соединением категории и тональности, таким образом один конкретный лейбл может быть только один раз в векторе возможных лейблов.

In [1]:
#!g2.1
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
#!g2.1
from datasets import load_dataset
raw_datasets = load_dataset("alexcadillon/SemEval2014Task4", 'restaurants')
raw_datasets

Downloading builder script:   0%|          | 0.00/10.0k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/359k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating trial split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    trial: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 100
    })
    train: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 800
    })
})

In [3]:
#!g2.1
categories = []
for i in raw_datasets['train']['aspectCategories']:
    if i[0]['category'] not in categories:
        categories.append(i[0]['category'])
categories    

['service', 'food', 'anecdotes/miscellaneous', 'ambience', 'price']

In [4]:
#!g2.1
labels = ['service_positive', 'service_negative', 'service_neutral', 'service_conflict',
          'food_positive', 'food_negative', 'food_neutral', 'food_conflict',
          'anecdotes/miscellaneous_positive', 'anecdotes/miscellaneous_negative', 'anecdotes/miscellaneous_neutral', 'anecdotes/miscellaneous_conflict',
          'ambience_positive', 'ambience_negative', 'ambience_neutral', 'ambience_conflict',
          'price_positive','price_negative' , 'price_neutral', 'price_conflict']

In [5]:
#!g2.1
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
label2id

{'service_positive': 0,
 'service_negative': 1,
 'service_neutral': 2,
 'service_conflict': 3,
 'food_positive': 4,
 'food_negative': 5,
 'food_neutral': 6,
 'food_conflict': 7,
 'anecdotes/miscellaneous_positive': 8,
 'anecdotes/miscellaneous_negative': 9,
 'anecdotes/miscellaneous_neutral': 10,
 'anecdotes/miscellaneous_conflict': 11,
 'ambience_positive': 12,
 'ambience_negative': 13,
 'ambience_neutral': 14,
 'ambience_conflict': 15,
 'price_positive': 16,
 'price_negative': 17,
 'price_neutral': 18,
 'price_conflict': 19}

In [6]:
#!g2.1
sample = raw_datasets['train'][5]
sample

{'sentenceId': '2846',
 'text': "Not only was the food outstanding, but the little 'perks' were great.",
 'aspectTerms': [{'term': 'food',
   'polarity': 'positive',
   'from': '17',
   'to': '21'},
  {'term': 'perks', 'polarity': 'positive', 'from': '51', 'to': '56'}],
 'aspectCategories': [{'category': 'food', 'polarity': 'positive'},
  {'category': 'service', 'polarity': 'positive'}]}

In [7]:
#!g2.1
def combo_label(example):   
    labels_combo = []
    for item in example['aspectCategories']:
        labels_combo.append(f"{item['category']}_{item['polarity']}")
    example['labels_combo'] = labels_combo
    return example

In [8]:
#!g2.1
dataset = raw_datasets.map(combo_label)
dataset



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/3041 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

DatasetDict({
    trial: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories', 'labels_combo'],
        num_rows: 100
    })
    train: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories', 'labels_combo'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories', 'labels_combo'],
        num_rows: 800
    })
})

In [9]:
#!g2.1
sample = dataset['train'][0]
sample

{'sentenceId': '3121',
 'text': 'But the staff was so horrible to us.',
 'aspectTerms': [{'term': 'staff',
   'polarity': 'negative',
   'from': '8',
   'to': '13'}],
 'aspectCategories': [{'category': 'service', 'polarity': 'negative'}],
 'labels_combo': ['service_negative']}

In [10]:
#!g2.1
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("albert-xxlarge-v2")
label_counts = len(labels)

def preprocess_data(example):
  # take a batch of texts
  text = example["text"]
  # encode them
  encoding = tokenizer(text, add_special_tokens=True)
  # add labels
  label_ids = [0 for i in range(label_counts)]
  for item in example['labels_combo']:
            new = [1.0 if l == labels.index(item) else 0.0 for l in range(label_counts)]
            label_ids = [x+y for x,y in zip(label_ids, new)]
            new = [0 for i in range(label_counts)]

  encoding["labels"] = label_ids

  return encoding

config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [11]:
#!g2.1
preprocess_data(sample)

{'input_ids': [2, 47, 14, 1138, 23, 86, 9244, 20, 182, 9, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [12]:
#!g2.1
tokenized_dataset = dataset.map(preprocess_data, remove_columns=dataset['train'].column_names)
tokenized_dataset

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/3041 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

DatasetDict({
    trial: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
})

In [13]:
#!g2.1
raw_datasets['train'][15]

{'sentenceId': '3359',
 'text': 'The pizza is the best if you like thin crusted pizza.',
 'aspectTerms': [{'term': 'pizza',
   'polarity': 'positive',
   'from': '4',
   'to': '9'},
  {'term': 'thin crusted pizza',
   'polarity': 'neutral',
   'from': '34',
   'to': '52'}],
 'aspectCategories': [{'category': 'food', 'polarity': 'positive'}]}

In [14]:
#!g2.1
example = tokenized_dataset['train'][15]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [15]:
#!g2.1
tokenizer.decode(example['input_ids'])



'[CLS] the pizza is the best if you like thin crusted pizza.[SEP]'

In [16]:
#!g2.1
example['labels']

[0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [17]:
#!g2.1
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['food_positive']

In [18]:
#!g2.1
tokenized_dataset.set_format("torch")

In [19]:
#!g2.1
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("albert-xxlarge-v2", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors:   0%|          | 0.00/893M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xxlarge-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
#!g2.1
batch_size = 8

In [21]:
#!g2.1
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = True)

In [22]:
#!g2.1
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    #push_to_hub=True,
)

In [23]:
#!g2.1
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [24]:
#!g2.1
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator = data_collator
)

trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.10334,0.301529,0.639495,0.56
2,0.140800,0.070195,0.451156,0.708728,0.68875
3,0.065100,0.068412,0.535523,0.748901,0.70875
4,0.036600,0.0655,0.558758,0.760101,0.73875
5,0.036600,0.065141,0.559589,0.760702,0.75


TrainOutput(global_step=1905, training_loss=0.06798371542782922, metrics={'train_runtime': 614.0828, 'train_samples_per_second': 24.761, 'train_steps_per_second': 3.102, 'total_flos': 702077429356296.0, 'train_loss': 0.06798371542782922, 'epoch': 5.0})

# Финальные выводы

Результаты получились довольно посредственные, так как я ничего не делала с дисбалансом классов, особо низкая метрика F1