In [1]:
#!g2.1
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
#!g2.1
from datasets import load_dataset
raw_datasets = load_dataset("alexcadillon/SemEval2014Task4", 'restaurants')
raw_datasets

Downloading builder script: 100%|██████████| 10.0k/10.0k [00:00<00:00, 6.94MB/s]
Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]
Downloading data: 100%|██████████| 35.7k/35.7k [00:00<00:00, 17.9MB/s]
Downloading data files:  33%|███▎      | 1/3 [00:00<00:01,  1.49it/s]
Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s][A
Downloading data:  16%|█▌        | 198k/1.24M [00:00<00:00, 1.89MB/s][A
Downloading data: 100%|██████████| 1.24M/1.24M [00:00<00:00, 5.91MB/s][A
Downloading data files:  67%|██████▋   | 2/3 [00:01<00:00,  1.25it/s]
Downloading data:   0%|          | 0.00/359k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 359k/359k [00:00<00:00, 2.37MB/s][A
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.27it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 115.31it/s]
Generating trial split: 100 examples [00:00, 1025.54 examples/s]
Generating train split: 3041 examples [00:00, 10711.67 examples/s]
Generating test split: 800

DatasetDict({
    trial: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 100
    })
    train: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories'],
        num_rows: 800
    })
})

In [3]:
#!g2.1
categories = []
for i in raw_datasets['train']['aspectCategories']:
    if i[0]['category'] not in categories:
        categories.append(i[0]['category'])
categories    

['service', 'food', 'anecdotes/miscellaneous', 'ambience', 'price']

In [4]:
#!g2.1
labels = ['service', 'food', 'anecdotes/miscellaneous', 'ambience', 'price']


In [5]:
#!g2.1
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
label2id

{'service': 0,
 'food': 1,
 'anecdotes/miscellaneous': 2,
 'ambience': 3,
 'price': 4}

In [6]:
#!g2.1
sample = raw_datasets['train'][5]
sample

{'sentenceId': '2846',
 'text': "Not only was the food outstanding, but the little 'perks' were great.",
 'aspectTerms': [{'term': 'food',
   'polarity': 'positive',
   'from': '17',
   'to': '21'},
  {'term': 'perks', 'polarity': 'positive', 'from': '51', 'to': '56'}],
 'aspectCategories': [{'category': 'food', 'polarity': 'positive'},
  {'category': 'service', 'polarity': 'positive'}]}

In [12]:
#!g2.1
def combo_label(example):   
    labels_combo = []
    for item in example['aspectCategories']:
        labels_combo.append(f"{item['category']}")
    example['labels_combo'] = labels_combo
    return example

In [13]:
#!g2.1
dataset = raw_datasets.map(combo_label)
dataset

Map: 100%|██████████| 100/100 [00:00<00:00, 5672.65 examples/s]
Map: 100%|██████████| 3041/3041 [00:00<00:00, 11690.95 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 11480.04 examples/s]


DatasetDict({
    trial: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories', 'labels_combo'],
        num_rows: 100
    })
    train: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories', 'labels_combo'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['sentenceId', 'text', 'aspectTerms', 'aspectCategories', 'labels_combo'],
        num_rows: 800
    })
})

In [14]:
#!g2.1
sample = dataset['train'][0]
sample

{'sentenceId': '3121',
 'text': 'But the staff was so horrible to us.',
 'aspectTerms': [{'term': 'staff',
   'polarity': 'negative',
   'from': '8',
   'to': '13'}],
 'aspectCategories': [{'category': 'service', 'polarity': 'negative'}],
 'labels_combo': ['service']}

In [15]:
#!g2.1
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("albert-xxlarge-v2")
label_counts = len(labels)

def preprocess_data(example):
  # take a batch of texts
  text = example["text"]
  # encode them
  encoding = tokenizer(text, add_special_tokens=True)
  # add labels
  label_ids = [0 for i in range(label_counts)]
  for item in example['labels_combo']:
            new = [1.0 if l == labels.index(item) else 0.0 for l in range(label_counts)]
            label_ids = [x+y for x,y in zip(label_ids, new)]
            new = [0 for i in range(label_counts)]

  encoding["labels"] = label_ids

  return encoding

In [16]:
#!g2.1
preprocess_data(sample)

{'input_ids': [2, 47, 14, 1138, 23, 86, 9244, 20, 182, 9, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1.0, 0.0, 0.0, 0.0, 0.0]}

In [17]:
#!g2.1
tokenized_dataset = dataset.map(preprocess_data, remove_columns=dataset['train'].column_names)
tokenized_dataset

Map: 100%|██████████| 100/100 [00:00<00:00, 2467.12 examples/s]
Map: 100%|██████████| 3041/3041 [00:00<00:00, 3267.22 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 3883.64 examples/s]


DatasetDict({
    trial: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
})

In [18]:
#!g2.1
raw_datasets['train'][15]

{'sentenceId': '3359',
 'text': 'The pizza is the best if you like thin crusted pizza.',
 'aspectTerms': [{'term': 'pizza',
   'polarity': 'positive',
   'from': '4',
   'to': '9'},
  {'term': 'thin crusted pizza',
   'polarity': 'neutral',
   'from': '34',
   'to': '52'}],
 'aspectCategories': [{'category': 'food', 'polarity': 'positive'}]}

In [19]:
#!g2.1
example = tokenized_dataset['train'][15]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [20]:
#!g2.1
tokenizer.decode(example['input_ids'])



'[CLS] the pizza is the best if you like thin crusted pizza.[SEP]'

In [21]:
#!g2.1
example['labels']

[0.0, 1.0, 0.0, 0.0, 0.0]

In [22]:
#!g2.1
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['food']

In [23]:
#!g2.1
tokenized_dataset.set_format("torch")

In [24]:
#!g2.1
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("albert-xxlarge-v2", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors: 100%|██████████| 893M/893M [00:13<00:00, 65.0MB/s] 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xxlarge-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
#!g2.1
batch_size = 8

In [26]:
#!g2.1
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = True)

In [27]:
#!g2.1
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    #push_to_hub=True,
)

In [28]:
#!g2.1
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [29]:
#!g2.1
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator = data_collator
)

trainer.train()

  0%|          | 0/1905 [00:00<?, ?it/s]You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 20%|██        | 381/1905 [02:27<08:30,  2.99it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:07, 13.78it/s][A
  5%|▌         | 5/100 [00:00<00:08, 11.04it/s][A
  7%|▋         | 7/100 [00:00<00:09,  9.42it/s][A
  8%|▊         | 8/100 [00:00<00:10,  9.10it/s][A
  9%|▉         | 9/100 [00:00<00:09,  9.28it/s][A
 10%|█         | 10/100 [00:01<00:09,  9.44it/s][A
 11%|█         | 11/100 [00:01<00:09,  9.05it/s][A
 13%|█▎        | 13/100 [00:01<00:08, 10.13it/s][A
 15%|█▌        | 15/100 [00:01<00:08, 10.12it/s][A
 17%|█▋        | 17/100 [00:01<00:08,  9.39it/s][A
 18%|█▊        | 18/100 [00:01<00:08,  9.49it/s][A
 19%|█▉        | 19/100 [00:02<00:09,  8.61it/s][A
 21%|██        | 21

{'eval_loss': 0.14634805917739868, 'eval_f1': 0.8792216546313046, 'eval_roc_auc': 0.908165687613814, 'eval_accuracy': 0.81375, 'eval_runtime': 12.1404, 'eval_samples_per_second': 65.895, 'eval_steps_per_second': 8.237, 'epoch': 1.0}


 26%|██▌       | 500/1905 [03:26<09:37,  2.43it/s]  

{'loss': 0.2673, 'learning_rate': 1.4750656167979002e-05, 'epoch': 1.31}


 40%|████      | 762/1905 [05:05<06:04,  3.14it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:07, 13.81it/s][A
  5%|▌         | 5/100 [00:00<00:08, 11.02it/s][A
  7%|▋         | 7/100 [00:00<00:09,  9.39it/s][A
  8%|▊         | 8/100 [00:00<00:10,  9.06it/s][A
  9%|▉         | 9/100 [00:00<00:09,  9.24it/s][A
 10%|█         | 10/100 [00:01<00:09,  9.42it/s][A
 11%|█         | 11/100 [00:01<00:09,  9.05it/s][A
 13%|█▎        | 13/100 [00:01<00:08, 10.15it/s][A
 15%|█▌        | 15/100 [00:01<00:08, 10.12it/s][A
 17%|█▋        | 17/100 [00:01<00:08,  9.38it/s][A
 18%|█▊        | 18/100 [00:01<00:08,  9.46it/s][A
 19%|█▉        | 19/100 [00:02<00:09,  8.58it/s][A
 20%|██        | 20/100 [00:02<00:09,  8.88it/s][A
 22%|██▏       | 22/100 [00:02<00:08,  9.39it/s][A
 23%|██▎       | 23/100 [00:02<00:08,  9.50it/s][A
 25%|██▌       | 25/100 [00:02<00:07,  9.97it/s][A
 26%|██▌       | 26/100 [00:02<00:07,  9.51it/s][A
 27%|██▋       | 27/100 [00

{'eval_loss': 0.14918918907642365, 'eval_f1': 0.9201353382009939, 'eval_roc_auc': 0.9395855998455869, 'eval_accuracy': 0.8525, 'eval_runtime': 12.1397, 'eval_samples_per_second': 65.899, 'eval_steps_per_second': 8.237, 'epoch': 2.0}


 52%|█████▏    | 1000/1905 [06:50<05:37,  2.68it/s] 

{'loss': 0.1008, 'learning_rate': 9.501312335958006e-06, 'epoch': 2.62}


 60%|██████    | 1143/1905 [07:45<03:31,  3.61it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:07, 13.80it/s][A
  5%|▌         | 5/100 [00:00<00:08, 11.00it/s][A
  7%|▋         | 7/100 [00:00<00:09,  9.40it/s][A
  8%|▊         | 8/100 [00:00<00:10,  9.07it/s][A
  9%|▉         | 9/100 [00:00<00:09,  9.25it/s][A
 10%|█         | 10/100 [00:01<00:09,  9.43it/s][A
 11%|█         | 11/100 [00:01<00:09,  9.05it/s][A
 13%|█▎        | 13/100 [00:01<00:08, 10.13it/s][A
 15%|█▌        | 15/100 [00:01<00:08, 10.11it/s][A
 17%|█▋        | 17/100 [00:01<00:08,  9.37it/s][A
 18%|█▊        | 18/100 [00:01<00:08,  9.47it/s][A
 19%|█▉        | 19/100 [00:02<00:09,  8.60it/s][A
 21%|██        | 21/100 [00:02<00:08,  9.59it/s][A
 22%|██▏       | 22/100 [00:02<00:08,  9.24it/s][A
 23%|██▎       | 23/100 [00:02<00:08,  9.38it/s][A
 25%|██▌       | 25/100 [00:02<00:07,  9.93it/s][A
 26%|██▌       | 26/100 [00:02<00:07,  9.50it/s][A
 27%|██▋       | 27/100 [0

{'eval_loss': 0.1345946192741394, 'eval_f1': 0.9279568449839622, 'eval_roc_auc': 0.9516486013557925, 'eval_accuracy': 0.86625, 'eval_runtime': 12.137, 'eval_samples_per_second': 65.914, 'eval_steps_per_second': 8.239, 'epoch': 3.0}


 79%|███████▊  | 1500/1905 [10:13<02:12,  3.06it/s]

{'loss': 0.0385, 'learning_rate': 4.251968503937008e-06, 'epoch': 3.94}


 80%|████████  | 1524/1905 [10:23<02:41,  2.36it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:07, 13.85it/s][A
  5%|▌         | 5/100 [00:00<00:08, 11.08it/s][A
  7%|▋         | 7/100 [00:00<00:09,  9.36it/s][A
  8%|▊         | 8/100 [00:00<00:10,  9.04it/s][A
  9%|▉         | 9/100 [00:00<00:09,  9.23it/s][A
 10%|█         | 10/100 [00:01<00:09,  9.41it/s][A
 11%|█         | 11/100 [00:01<00:09,  8.98it/s][A
 13%|█▎        | 13/100 [00:01<00:08, 10.08it/s][A
 14%|█▍        | 14/100 [00:01<00:08, 10.01it/s][A
 15%|█▌        | 15/100 [00:01<00:08,  9.98it/s][A
 16%|█▌        | 16/100 [00:01<00:08,  9.37it/s][A
 17%|█▋        | 17/100 [00:01<00:09,  9.02it/s][A
 18%|█▊        | 18/100 [00:01<00:08,  9.20it/s][A
 19%|█▉        | 19/100 [00:02<00:09,  8.22it/s][A
 21%|██        | 21/100 [00:02<00:08,  9.47it/s][A
 22%|██▏       | 22/100 [00:02<00:08,  9.11it/s][A
 23%|██▎       | 23/100 [00:02<00:08,  9.29it/s][A
 25%|██▌       | 25/100 [0

{'eval_loss': 0.1539347618818283, 'eval_f1': 0.9332828420197249, 'eval_roc_auc': 0.9543375297253321, 'eval_accuracy': 0.86875, 'eval_runtime': 12.1395, 'eval_samples_per_second': 65.9, 'eval_steps_per_second': 8.238, 'epoch': 4.0}


100%|██████████| 1905/1905 [13:02<00:00,  3.72it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  3%|▎         | 3/100 [00:00<00:06, 13.91it/s][A
  5%|▌         | 5/100 [00:00<00:08, 11.00it/s][A
  7%|▋         | 7/100 [00:00<00:09,  9.37it/s][A
  8%|▊         | 8/100 [00:00<00:10,  9.04it/s][A
  9%|▉         | 9/100 [00:00<00:09,  9.24it/s][A
 10%|█         | 10/100 [00:01<00:09,  9.42it/s][A
 11%|█         | 11/100 [00:01<00:09,  9.06it/s][A
 13%|█▎        | 13/100 [00:01<00:08, 10.13it/s][A
 15%|█▌        | 15/100 [00:01<00:08, 10.11it/s][A
 17%|█▋        | 17/100 [00:01<00:08,  9.35it/s][A
 18%|█▊        | 18/100 [00:01<00:08,  9.41it/s][A
 19%|█▉        | 19/100 [00:02<00:09,  8.56it/s][A
 21%|██        | 21/100 [00:02<00:08,  9.56it/s][A
 22%|██▏       | 22/100 [00:02<00:08,  9.22it/s][A
 23%|██▎       | 23/100 [00:02<00:08,  9.39it/s][A
 25%|██▌       | 25/100 [00:02<00:07,  9.96it/s][A
 26%|██▌       | 26/100 [00:02<00:07,  9.51it/s][A
 28%|██▊       | 28/100 [0

{'eval_loss': 0.15995360910892487, 'eval_f1': 0.9344935829831197, 'eval_roc_auc': 0.953860251743541, 'eval_accuracy': 0.8725, 'eval_runtime': 12.1211, 'eval_samples_per_second': 66.001, 'eval_steps_per_second': 8.25, 'epoch': 5.0}
{'train_runtime': 794.8816, 'train_samples_per_second': 19.129, 'train_steps_per_second': 2.397, 'train_loss': 0.10925025113924282, 'epoch': 5.0}





TrainOutput(global_step=1905, training_loss=0.10925025113924282, metrics={'train_runtime': 794.8816, 'train_samples_per_second': 19.129, 'train_steps_per_second': 2.397, 'train_loss': 0.10925025113924282, 'epoch': 5.0})