<a href="https://colab.research.google.com/github/LorenzoAgnolucci/BERT_for_ABSA/blob/master/BERT_for_ABSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---


# BERT-pair


---



In [1]:
#@title Choose a dataset and a task { run: "auto", display-mode: "form" }
# base_dir is the repository root. The training cell builds its output paths as
# f'{base_dir}models\\...', i.e. without adding a separator, so the value must
# end with a path separator.
base_dir = "C:\\Users\\prave\\Repos\\BERT_for_ABSA\\" #@param {type:"string"}
# dataset_type selects the label set and the data sub-directory that is read.
dataset_type = "sentihood" #@param ["sentihood", "semeval2014"]
# task selects the train_/dev_/test_{task}.csv files; tasks ending in "B" are
# trained with two output classes.
task = "NLI_M" #@param ["QA_M", "NLI_M", "QA_B", "NLI_B"]

In [2]:
import pandas as pd
import random

# Label vocabularies for the selected dataset: id -> name, and the inverse map.
if dataset_type == "sentihood":
    id2label = {0: "None", 1: "Positive", 2: "Negative"}
    label2id = {name: idx for idx, name in id2label.items()}
elif dataset_type == "semeval2014":
    id2label = {0: "positive", 1: "neutral", 2: "negative", 3: "conflict", 4: "none"}
    label2id = {name: idx for idx, name in id2label.items()}

# Width of the classification head: "B" tasks use two classes, every other
# task uses one class per label of the dataset.
if task.endswith("B"):
    num_classes = 2
elif dataset_type == "sentihood":
    num_classes = 3
elif dataset_type == "semeval2014":
    num_classes = 5

def get_dataset(path):
    """Load a tab-separated BERT-pair file into three parallel lists.

    The first line of the file is treated as a header. For every remaining
    row, column 1 is collected as the original sentence, column 2 as the
    auxiliary sentence and column 3 as the label.

    Returns:
        A tuple ``(original_sentences, auxiliary_sentences, labels)``.
    """
    records = pd.read_csv(path, header=0, sep="\t").values.tolist()
    original_sentences = [record[1] for record in records]
    auxiliary_sentences = [record[2] for record in records]
    labels = [record[3] for record in records]
    return original_sentences, auxiliary_sentences, labels

import os


def _pair_csv(split):
    """Return the path of the BERT-pair CSV for `split` ("train", "dev" or "test")."""
    # os.path.join does not double the separator when base_dir already ends
    # with one, and it does not hard-code the Windows backslash.
    return os.path.join(base_dir, "data", dataset_type, "BERT-pair", f"{split}_{task}.csv")


train_original_sentences, train_auxiliary_sentences, train_labels = get_dataset(_pair_csv("train"))
# Validation data: sentihood reads dev_{task}.csv, semeval2014 reads the same
# test_{task}.csv file that is also loaded as the test split below.
if dataset_type == "sentihood":
    val_original_sentences, val_auxiliary_sentences, val_labels = get_dataset(_pair_csv("dev"))
elif dataset_type == "semeval2014":
    val_original_sentences, val_auxiliary_sentences, val_labels = get_dataset(_pair_csv("test"))
test_original_sentences, test_auxiliary_sentences, test_labels = get_dataset(_pair_csv("test"))

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_pairs(first_sentences, second_sentences):
    """Tokenize (original, auxiliary) sentence pairs with truncation and padding on."""
    return tokenizer(first_sentences, second_sentences, truncation=True, padding=True)


# One encoding object per split.
train_encodings = _encode_pairs(train_original_sentences, train_auxiliary_sentences)
val_encodings = _encode_pairs(val_original_sentences, val_auxiliary_sentences)
test_encodings = _encode_pairs(test_original_sentences, test_auxiliary_sentences)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch

class ABSA_Dataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        # encodings: mapping from feature name to a per-example indexable collection.
        # labels: per-example labels; its length defines the dataset size.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every feature of example `idx` to a tensor, then attach its label.
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and labels of each split in an ABSA_Dataset.
train_dataset, val_dataset, test_dataset = (
    ABSA_Dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [5]:
import sys
# Put the repository root on sys.path (presumably so the project-local
# `evaluation` module imported in this cell can be found).
# The guard has to test the exact string that gets inserted: checking only the
# bare `base_dir` never matched the inserted f'{base_dir}/' entry, so every
# re-run of the cell pushed another duplicate onto sys.path.
_repo_import_path = f'{base_dir}/'
if base_dir not in sys.path and _repo_import_path not in sys.path:
    sys.path.insert(0, _repo_import_path)
import numpy as np
from scipy.special import softmax
import evaluation


def get_test_labels(data_dir, dataset_type):
    """Return the labels stored in the NLI_M test file of `dataset_type`.

    Reads ``{data_dir}/{dataset_type}/BERT-pair/test_NLI_M.csv`` (tab-separated,
    first line is a header) and returns the value of column 3 of every row, in
    file order. The NLI_M file is read regardless of which task is selected in
    the notebook.

    Args:
        data_dir: directory that contains one sub-directory per dataset.
        dataset_type: name of the dataset sub-directory.

    Returns:
        A list with one label per data row of the file.
    """
    rows = pd.read_csv(f"{data_dir}/{dataset_type}/BERT-pair/test_NLI_M.csv", header=0, sep="\t").values.tolist()
    return [row[3] for row in rows]


def get_predictions(data, task, dataset_type):
    """Collapse the per-class rows of a "B" task into one prediction per aspect.

    Rows are consumed in consecutive groups: 3 rows per group for "sentihood"
    and 5 for "semeval2014". From every row only the value at index 2 is used
    (the "yes" score when rows are laid out as [predicted_label, no, yes]).
    The values of a group are divided by their sum, and the index of the
    largest one is the predicted label for that group.

    Args:
        data: iterable of rows indexable at position 2.
        task: task name; only names ending in "B" produce any output.
        dataset_type: "sentihood" or "semeval2014"; other values produce no output.

    Returns:
        A tuple ``(predicted_labels, scores)`` with one entry per complete
        group. A trailing incomplete group is ignored. Both lists are empty
        when the task does not end in "B" or the dataset is not recognised.
    """
    predicted_labels = []
    scores = []

    # The two datasets differ only in how many rows form one aspect group.
    rows_per_aspect = {"sentihood": 3, "semeval2014": 5}.get(dataset_type)
    if not task.endswith("B") or rows_per_aspect is None:
        return predicted_labels, scores

    current_aspect_scores = []
    for row in data:
        current_aspect_scores.append(row[2])
        if len(current_aspect_scores) == rows_per_aspect:
            # Normalise the group's scores so that they sum to one.
            group_total = np.sum(current_aspect_scores)
            normalized_scores = [score / group_total for score in current_aspect_scores]
            scores.append(normalized_scores)
            predicted_labels.append(np.argmax(normalized_scores))
            current_aspect_scores = []
    return predicted_labels, scores


if dataset_type == "sentihood":
    def compute_metrics(predictions):
        """Compute the Sentihood metrics for a batch of model predictions.

        `predictions[0]` is iterated row by row; each row is passed through
        softmax and its argmax is the predicted label. For "B" tasks the rows
        are first regrouped per aspect with get_predictions. Gold labels are
        read with get_test_labels.

        NOTE(review): the gold labels always come from the test file, while
        the Trainer in this notebook is given val_dataset as eval_dataset;
        confirm that both refer to the same examples for this dataset.

        Returns:
            dict with keys "strict_acc", "F1", "aspect_AUC", "sentiment_acc"
            and "sentiment_AUC".
        """
        class_scores = [softmax(row) for row in predictions[0]]
        predicted = [np.argmax(row_scores) for row_scores in class_scores]
        if task.endswith("B"):
            # Prepend the predicted label as column 0, the layout get_predictions expects.
            table = np.insert(class_scores, 0, predicted, axis=1)
            predicted, class_scores = get_predictions(table, task, dataset_type)
        gold = get_test_labels(f"{base_dir}/data", dataset_type)

        strict_accuracy = evaluation.compute_sentihood_aspect_strict_accuracy(gold, predicted)
        aspect_f1 = evaluation.compute_sentihood_aspect_macro_F1(gold, predicted)
        aspect_auc = evaluation.compute_sentihood_aspect_macro_AUC(gold, class_scores)
        sentiment_auc, sentiment_accuracy = evaluation.compute_sentihood_sentiment_classification_metrics(gold, class_scores)
        return {
            "strict_acc": strict_accuracy,
            "F1": aspect_f1,
            "aspect_AUC": aspect_auc,
            "sentiment_acc": sentiment_accuracy,
            "sentiment_AUC": sentiment_auc,
        }

elif dataset_type == "semeval2014":
    def compute_metrics(predictions):
        """Compute the SemEval-2014 metrics for a batch of model predictions.

        `predictions[0]` is iterated row by row; each row is passed through
        softmax and its argmax is the predicted label. For "B" tasks the rows
        are first regrouped per aspect with get_predictions. Gold labels are
        read with get_test_labels.

        Returns:
            dict with keys "P", "R", "F1", "4-way", "3-way" and "binary".
        """
        class_scores = [softmax(row) for row in predictions[0]]
        predicted = [np.argmax(row_scores) for row_scores in class_scores]
        if task.endswith("B"):
            # Prepend the predicted label as column 0, the layout get_predictions expects.
            table = np.insert(class_scores, 0, predicted, axis=1)
            predicted, class_scores = get_predictions(table, task, dataset_type)
        gold = get_test_labels(f"{base_dir}/data", dataset_type)

        precision, recall, f1_score = evaluation.compute_semeval_PRF(gold, predicted)
        accuracy_4_way = evaluation.compute_semeval_accuracy(gold, predicted, class_scores, 4)
        accuracy_3_way = evaluation.compute_semeval_accuracy(gold, predicted, class_scores, 3)
        accuracy_binary = evaluation.compute_semeval_accuracy(gold, predicted, class_scores, 2)
        return {
            "P": precision,
            "R": recall,
            "F1": f1_score,
            "4-way": accuracy_4_way,
            "3-way": accuracy_3_way,
            "binary": accuracy_binary,
        }

In [19]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertConfig

from transformers import logging
logging.set_verbosity_debug()

epochs = 4
batch_size = 24
num_steps = len(train_dataset) * epochs // batch_size
warmup_steps = num_steps // 10  # 10% of the training steps
save_steps = num_steps // epochs    # Save a checkpoint at the end of each epoch

# Directory for checkpoints and the final "last_step" model of this run.
# base_dir already ends with a separator, so none is added after it. The
# previous literal contained '\\\B', an invalid escape sequence that produced
# a doubled backslash in front of "BERT-pair".
_pair_model_dir = f'{base_dir}models\\{dataset_type}\\BERT-pair\\{task}\\'

training_args = TrainingArguments(
    output_dir = _pair_model_dir,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    warmup_steps = warmup_steps,
    weight_decay = 0.01,
    logging_dir = f'{base_dir}logs\\{dataset_type}\\BERT-pair\\{task}\\',
    logging_steps = 10,
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    save_steps = save_steps
)

config = BertConfig.from_pretrained(
    'bert-base-uncased',
    architectures = ['BertForSequenceClassification'],
    hidden_size = 768,
    num_hidden_layers = 12,
    num_attention_heads = 12,
    hidden_dropout_prob = 0.1,
    num_labels = num_classes
)

# Set to True to skip training and reuse the model saved under "last_step".
load_finetuned_model = False
if load_finetuned_model:
    model = BertForSequenceClassification.from_pretrained(f"{_pair_model_dir}last_step")
else:
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

# The Trainer is configured identically in both modes; only the model differs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

if not load_finetuned_model:
    trainer.train()

    model.save_pretrained(f"{_pair_model_dir}last_step")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at C:\Users\prave\.cache\huggingface\hub\models--bert-base-uncased\snapshots\86b5e0934494bd15c9632b12f734a8a67f723594\config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_e

{'loss': 1.0175, 'grad_norm': 12.165437698364258, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.02}


  1%|          | 20/2504 [06:13<16:01:27, 23.22s/it]

{'loss': 0.9781, 'grad_norm': 14.183942794799805, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.03}


  1%|          | 30/2504 [09:55<14:49:05, 21.56s/it]

{'loss': 0.8908, 'grad_norm': 15.549015045166016, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.05}


  2%|▏         | 40/2504 [13:50<16:38:53, 24.32s/it]

{'loss': 0.9142, 'grad_norm': 15.271132469177246, 'learning_rate': 3.2000000000000003e-06, 'epoch': 0.06}


  2%|▏         | 50/2504 [18:03<17:12:45, 25.25s/it]

{'loss': 0.8308, 'grad_norm': 7.778471946716309, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.08}


  2%|▏         | 60/2504 [22:11<16:46:43, 24.71s/it]

{'loss': 0.6898, 'grad_norm': 4.74627161026001, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.1}


  3%|▎         | 70/2504 [26:58<19:55:22, 29.47s/it]

{'loss': 0.6306, 'grad_norm': 7.399055480957031, 'learning_rate': 5.600000000000001e-06, 'epoch': 0.11}


  3%|▎         | 80/2504 [31:50<19:38:15, 29.16s/it]

{'loss': 0.5204, 'grad_norm': 2.0145955085754395, 'learning_rate': 6.4000000000000006e-06, 'epoch': 0.13}


  4%|▎         | 90/2504 [36:41<19:23:06, 28.91s/it]

{'loss': 0.5139, 'grad_norm': 2.4655892848968506, 'learning_rate': 7.2000000000000005e-06, 'epoch': 0.14}


  4%|▍         | 100/2504 [41:43<21:28:57, 32.17s/it]

{'loss': 0.6115, 'grad_norm': 5.680858612060547, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.16}


  4%|▍         | 110/2504 [45:55<16:54:00, 25.41s/it]

{'loss': 0.521, 'grad_norm': 2.5881359577178955, 'learning_rate': 8.8e-06, 'epoch': 0.18}


  5%|▍         | 120/2504 [50:06<16:37:52, 25.11s/it]

{'loss': 0.5502, 'grad_norm': 2.3802146911621094, 'learning_rate': 9.600000000000001e-06, 'epoch': 0.19}


  5%|▌         | 130/2504 [54:24<17:10:27, 26.04s/it]

{'loss': 0.4964, 'grad_norm': 2.8707191944122314, 'learning_rate': 1.04e-05, 'epoch': 0.21}


  6%|▌         | 140/2504 [58:43<17:00:32, 25.90s/it]

{'loss': 0.4114, 'grad_norm': 3.038390874862671, 'learning_rate': 1.1200000000000001e-05, 'epoch': 0.22}


  6%|▌         | 150/2504 [1:03:00<16:47:26, 25.68s/it]

{'loss': 0.5181, 'grad_norm': 4.402386665344238, 'learning_rate': 1.2e-05, 'epoch': 0.24}


  6%|▋         | 160/2504 [1:07:20<16:51:26, 25.89s/it]

{'loss': 0.5087, 'grad_norm': 2.569650173187256, 'learning_rate': 1.2800000000000001e-05, 'epoch': 0.26}


  7%|▋         | 170/2504 [1:11:40<16:47:32, 25.90s/it]

{'loss': 0.5129, 'grad_norm': 5.730218887329102, 'learning_rate': 1.3600000000000002e-05, 'epoch': 0.27}


  7%|▋         | 180/2504 [1:15:59<16:43:27, 25.91s/it]

{'loss': 0.4928, 'grad_norm': 3.384443998336792, 'learning_rate': 1.4400000000000001e-05, 'epoch': 0.29}


  8%|▊         | 190/2504 [1:20:17<16:35:23, 25.81s/it]

{'loss': 0.4594, 'grad_norm': 4.136019706726074, 'learning_rate': 1.5200000000000002e-05, 'epoch': 0.3}


  8%|▊         | 200/2504 [1:24:35<16:27:34, 25.72s/it]

{'loss': 0.5071, 'grad_norm': 3.2512004375457764, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.32}


  8%|▊         | 210/2504 [1:28:55<16:38:41, 26.12s/it]

{'loss': 0.4471, 'grad_norm': 2.8167428970336914, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.34}


  9%|▉         | 220/2504 [1:33:12<16:15:58, 25.64s/it]

{'loss': 0.5696, 'grad_norm': 5.386629581451416, 'learning_rate': 1.76e-05, 'epoch': 0.35}


  9%|▉         | 230/2504 [1:37:30<16:24:01, 25.96s/it]

{'loss': 0.4915, 'grad_norm': 3.7801308631896973, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.37}


 10%|▉         | 240/2504 [1:41:50<16:18:09, 25.92s/it]

{'loss': 0.4188, 'grad_norm': 3.1885814666748047, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.38}


 10%|▉         | 250/2504 [1:46:02<15:38:10, 24.97s/it]

{'loss': 0.49, 'grad_norm': 4.234561443328857, 'learning_rate': 2e-05, 'epoch': 0.4}


 10%|█         | 260/2504 [1:50:13<15:35:28, 25.01s/it]

{'loss': 0.4388, 'grad_norm': 3.7194483280181885, 'learning_rate': 1.9911268855368235e-05, 'epoch': 0.42}


 11%|█         | 270/2504 [1:54:23<15:34:06, 25.09s/it]

{'loss': 0.4053, 'grad_norm': 4.035529613494873, 'learning_rate': 1.982253771073647e-05, 'epoch': 0.43}


 11%|█         | 280/2504 [1:59:02<18:44:41, 30.34s/it]

{'loss': 0.3454, 'grad_norm': 14.92529010772705, 'learning_rate': 1.9733806566104706e-05, 'epoch': 0.45}


 12%|█▏        | 290/2504 [2:02:46<14:10:03, 23.04s/it]

{'loss': 0.4042, 'grad_norm': 5.1077704429626465, 'learning_rate': 1.964507542147294e-05, 'epoch': 0.46}


 12%|█▏        | 300/2504 [2:06:53<15:09:58, 24.77s/it]

{'loss': 0.3954, 'grad_norm': 3.977452039718628, 'learning_rate': 1.9556344276841174e-05, 'epoch': 0.48}


 12%|█▏        | 310/2504 [2:11:06<15:22:35, 25.23s/it]

{'loss': 0.3815, 'grad_norm': 6.658669948577881, 'learning_rate': 1.9467613132209407e-05, 'epoch': 0.5}


 13%|█▎        | 320/2504 [2:15:16<15:09:09, 24.98s/it]

{'loss': 0.4061, 'grad_norm': 7.621420383453369, 'learning_rate': 1.937888198757764e-05, 'epoch': 0.51}


 13%|█▎        | 330/2504 [2:19:25<15:04:43, 24.97s/it]

{'loss': 0.4449, 'grad_norm': 2.777791976928711, 'learning_rate': 1.9290150842945875e-05, 'epoch': 0.53}


 14%|█▎        | 340/2504 [2:23:35<15:14:41, 25.36s/it]

{'loss': 0.3933, 'grad_norm': 4.976013660430908, 'learning_rate': 1.920141969831411e-05, 'epoch': 0.54}


 14%|█▍        | 350/2504 [2:28:50<18:58:32, 31.71s/it]

{'loss': 0.2729, 'grad_norm': 2.95953369140625, 'learning_rate': 1.9112688553682342e-05, 'epoch': 0.56}


 14%|█▍        | 360/2504 [2:33:38<17:54:22, 30.07s/it]

{'loss': 0.3527, 'grad_norm': 6.336037635803223, 'learning_rate': 1.9023957409050576e-05, 'epoch': 0.58}


 15%|█▍        | 370/2504 [2:38:36<17:57:32, 30.30s/it]

{'loss': 0.3013, 'grad_norm': 5.022852420806885, 'learning_rate': 1.8935226264418813e-05, 'epoch': 0.59}


 15%|█▌        | 380/2504 [2:43:38<17:38:16, 29.89s/it]

{'loss': 0.3622, 'grad_norm': 3.506969690322876, 'learning_rate': 1.8846495119787047e-05, 'epoch': 0.61}


 16%|█▌        | 390/2504 [2:48:38<17:36:05, 29.97s/it]

{'loss': 0.4093, 'grad_norm': 9.562499046325684, 'learning_rate': 1.875776397515528e-05, 'epoch': 0.62}


 16%|█▌        | 400/2504 [2:53:36<17:22:32, 29.73s/it]

{'loss': 0.2731, 'grad_norm': 6.805218696594238, 'learning_rate': 1.8669032830523514e-05, 'epoch': 0.64}


 16%|█▋        | 410/2504 [2:58:18<15:57:43, 27.44s/it]

{'loss': 0.2188, 'grad_norm': 4.388709545135498, 'learning_rate': 1.8580301685891748e-05, 'epoch': 0.65}


 17%|█▋        | 420/2504 [3:03:06<16:44:13, 28.91s/it]

{'loss': 0.3404, 'grad_norm': 6.490188121795654, 'learning_rate': 1.8491570541259985e-05, 'epoch': 0.67}


 17%|█▋        | 430/2504 [3:07:48<16:30:46, 28.66s/it]

{'loss': 0.2398, 'grad_norm': 3.8625547885894775, 'learning_rate': 1.840283939662822e-05, 'epoch': 0.69}


 17%|█▋        | 433/2504 [3:09:09<15:41:08, 27.27s/it]

In [None]:
# Evaluate the trainer on the test split and print the returned result.
evaluation_result = trainer.evaluate(test_dataset)
print(evaluation_result)



---


# BERT-pair


---



In [None]:
import pandas as pd

# Run the trainer on the test split, apply softmax to every prediction row and
# take the index of the largest score as the predicted label.
results = trainer.predict(test_dataset)

scores = list(map(softmax, results.predictions))
predicted_labels = list(map(np.argmax, scores))

In [None]:
# One output row per example: the predicted label in column 0, followed by the
# class scores.
csv_output = np.insert(scores, 0, predicted_labels, axis=1)
df = pd.DataFrame(csv_output)
df[0] = df[0].astype("int")

# "B" tasks have the two score columns no/yes; other tasks have one column per
# label, in label2id order.
if task.endswith("B"):
    header = ["predicted_label", "no", "yes"]
else:
    header = ["predicted_label", *label2id]
df.to_csv(f"{base_dir}/results/{dataset_type}/BERT-pair/{task}.csv", index=False, header=header)

In [None]:
import sys

# Put the repository root on sys.path before importing `evaluation`.
# The guard has to test the exact string that gets inserted: checking only the
# bare `base_dir` never matched the inserted f'{base_dir}/' entry, so every
# re-run of the cell pushed another duplicate onto sys.path.
_repo_import_path = f'{base_dir}/'
if base_dir not in sys.path and _repo_import_path not in sys.path:
    sys.path.insert(0, _repo_import_path)
import evaluation
# Run the evaluation entry point with the data and results directories.
evaluation.main(task, dataset_type, f"{base_dir}/data", f"{base_dir}/results")



---


# BERT-single


---



In [None]:
#@title Choose a dataset and a task { run: "auto", display-mode: "form" }
# base_dir is the project root. The BERT-single cells build paths as
# f"{base_dir}/data/...", so no trailing separator is needed here.
base_dir = "/gdrive/MyDrive/Machine_Learning" #@param {type:"string"}
# dataset_type selects the label set and the locations/aspects that are trained.
dataset_type = "semeval2014" #@param ["sentihood", "semeval2014"]
# task is passed on to evaluation.main.
task = "single" #@param ["single"]

In [None]:
import pandas as pd
import random


# Per-dataset settings: label vocabularies, number of classes, and the
# location prefixes / aspect names that together name the BERT-single
# data directories ({location}{aspect}).
if dataset_type == "sentihood":
    id2label = {0: "None", 1: "Positive", 2: "Negative"}
    label2id = {"None": 0, "Positive": 1, "Negative": 2}
    num_classes = 3
    locations = ["location_1_", "location_2_"]
    aspects = ["general", "price", "safety", "transit location"]
elif dataset_type == "semeval2014":
    id2label = {0: "positive", 1: "neutral", 2: "negative", 3: "conflict", 4: "none"}
    label2id = {"positive": 0, "neutral" : 1, "negative" : 2, "conflict": 3, "none": 4}
    num_classes = 5
    locations = [""]
    aspects = ["ambience", "anecdotes", "food", "price", "service"]


def get_dataset(path):
    """Load a tab-separated BERT-single file into two parallel lists.

    The first line of the file is treated as a header. For every remaining
    row, column 1 is collected as the sentence and column 3 as the label.

    Returns:
        A tuple ``(original_sentences, labels)``.
    """
    records = pd.read_csv(path, header=0, sep="\t").values.tolist()
    original_sentences = [record[1] for record in records]
    labels = [record[3] for record in records]
    return original_sentences, labels


# Nested lookups, indexed as <name>[location][aspect].
train_original_sentences, train_labels = {}, {}
val_original_sentences, val_labels = {}, {}
test_original_sentences, test_labels = {}, {}

for location in locations:
    # Start an empty per-aspect mapping for this location in every lookup.
    for split_store in (
        train_original_sentences, train_labels,
        val_original_sentences, val_labels,
        test_original_sentences, test_labels,
    ):
        split_store[location] = {}

    for aspect in aspects:
        aspect_dir = f"{base_dir}/data/{dataset_type}/BERT-single/{location}{aspect}"
        train_original_sentences[location][aspect], train_labels[location][aspect] = get_dataset(f"{aspect_dir}/train.csv")
        # Validation data: sentihood reads dev.csv, semeval2014 reads test.csv.
        if dataset_type == "sentihood":
            val_original_sentences[location][aspect], val_labels[location][aspect] = get_dataset(f"{aspect_dir}/dev.csv")
        elif dataset_type == "semeval2014":
            val_original_sentences[location][aspect], val_labels[location][aspect] = get_dataset(f"{aspect_dir}/test.csv")
        test_original_sentences[location][aspect], test_labels[location][aspect] = get_dataset(f"{aspect_dir}/test.csv")

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encodings indexed as <split>_encodings[location][aspect]; every sentence list
# is tokenized on its own with truncation and padding enabled.
train_encodings, val_encodings, test_encodings = {}, {}, {}
for location in locations:
    train_encodings[location] = {
        name: tokenizer(train_original_sentences[location][name], truncation=True, padding=True)
        for name in aspects
    }
    val_encodings[location] = {
        name: tokenizer(val_original_sentences[location][name], truncation=True, padding=True)
        for name in aspects
    }
    test_encodings[location] = {
        name: tokenizer(test_original_sentences[location][name], truncation=True, padding=True)
        for name in aspects
    }

In [None]:
import torch

class ABSA_Dataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        # encodings: mapping from feature name to a per-example indexable collection.
        # labels: per-example labels; its length defines the dataset size.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every feature of example `idx` to a tensor, then attach its label.
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Datasets indexed as <split>_dataset[location][aspect], each one pairing the
# matching encodings with the matching labels.
train_dataset, val_dataset, test_dataset = {}, {}, {}
for location in locations:
    train_dataset[location] = {
        name: ABSA_Dataset(train_encodings[location][name], train_labels[location][name])
        for name in aspects
    }
    val_dataset[location] = {
        name: ABSA_Dataset(val_encodings[location][name], val_labels[location][name])
        for name in aspects
    }
    test_dataset[location] = {
        name: ABSA_Dataset(test_encodings[location][name], test_labels[location][name])
        for name in aspects
    }

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertConfig
from transformers import logging
import gc
import pandas as pd
import numpy as np
from scipy.special import softmax

# Fine-tune one classifier per (location, aspect) pair, save it, predict on
# that pair's test split and write the predictions to a CSV file.
logging.set_verbosity_debug()

epochs = 4
batch_size = 24

# Column names of the result CSVs: the predicted label followed by one score
# column per label, in label2id order.
header = ["predicted_label"]
for label in label2id.keys():
    header.append(label)

# Model configuration shared by every per-aspect model.
config = BertConfig.from_pretrained(
        'bert-base-uncased',
        architectures = ['BertForSequenceClassification'],
        hidden_size = 768,
        num_hidden_layers = 12,
        num_attention_heads = 12,
        hidden_dropout_prob = 0.1,
        num_labels = num_classes
    )    

for location in locations:
    for aspect in aspects:
        # Step counts are derived from the size of this pair's training set.
        num_steps = len(train_dataset[location][aspect]) * epochs // batch_size
        warmup_steps = num_steps // 10  # 10% of the training steps
        save_steps = num_steps // epochs    # Save a checkpoint at the end of each epoch


        training_args = TrainingArguments(
            output_dir = f'{base_dir}/models/{dataset_type}/BERT-single/{location}{aspect}/',          
            num_train_epochs = epochs,              
            per_device_train_batch_size = batch_size,  
            per_device_eval_batch_size = batch_size,   
            warmup_steps = warmup_steps,   
            weight_decay = 0.01,               
            logging_dir = f'{base_dir}/logs/{dataset_type}/BERT-single/{location}{aspect}/',            
            logging_steps = 10,
            evaluation_strategy = 'epoch',
            learning_rate = 2e-5,
            save_steps = save_steps,
            seed=21
        )

        # Every pair starts again from the pretrained 'bert-base-uncased' weights.
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

        # No compute_metrics callback is passed to this Trainer.
        trainer = Trainer(
            model=model,                         
            args=training_args,                  
            train_dataset=train_dataset[location][aspect],         
            eval_dataset=val_dataset[location][aspect]             
        )

        trainer.train()

        model.save_pretrained(f"{base_dir}/models/{dataset_type}/BERT-single/{location}{aspect}/last_step")

        results = trainer.predict(test_dataset[location][aspect])

        # Softmax every prediction row; the argmax is the predicted label.
        scores = [softmax(prediction) for prediction in results.predictions]
        predicted_labels = [np.argmax(x) for x in scores]

        # One output row per example: predicted label in column 0, then the scores.
        csv_output = np.insert(scores, 0, predicted_labels, axis=1)
        df = pd.DataFrame(csv_output)
        df[0] = df[0].astype("int")
        df.to_csv(f"{base_dir}/results/{dataset_type}/BERT-single/{location}{aspect}.csv", index=False, header=header)

        # Drop this iteration's objects and force a garbage-collection pass
        # before the next model is created. Because of these `del` statements
        # the names `model` and `trainer` are unbound once the loop has finished.
        del training_args
        del model
        del trainer
        del results
        del scores
        del predicted_labels
        del csv_output
        del df
        gc.collect()

In [None]:
import sys

# Put the project root on sys.path before importing `evaluation`.
# The guard has to test the exact string that gets inserted: checking only the
# bare `base_dir` never matched the inserted f'{base_dir}/' entry, so every
# re-run of the cell pushed another duplicate onto sys.path.
_repo_import_path = f'{base_dir}/'
if base_dir not in sys.path and _repo_import_path not in sys.path:
    sys.path.insert(0, _repo_import_path)

import evaluation
# Run the evaluation entry point with the data and results directories.
evaluation.main(task, dataset_type, f"{base_dir}/data", f"{base_dir}/results")

In [None]:
# Inspect the first configured (location, aspect) pair. For sentihood this is
# "location_1_" / "general", the pair that used to be hard-coded here; those
# keys do not exist for semeval2014 (its only location is "" and it has no
# "general" aspect), so the hard-coded lookup raised KeyError for that dataset.
location = locations[0]
aspect = aspects[0]

# Accessing tokenized training data
tokenized_data = train_encodings[location][aspect]

In [None]:
import torch

input_ids = torch.tensor(tokenized_data['input_ids'])
attention_mask = torch.tensor(tokenized_data['attention_mask'])

# tokenized_data comes from tokenizing a whole list of sentences with padding,
# so the tensors already carry a batch dimension. Add one only for a single
# un-batched sequence; an unconditional unsqueeze(0) would turn the batch into
# a 3-D tensor.
if input_ids.dim() == 1:
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask
}

In [None]:
# The training loop deletes `model` at the end of every iteration, so the name
# is unbound here. Reload the fine-tuned weights that the loop saved for the
# selected (location, aspect) pair.
model = BertForSequenceClassification.from_pretrained(
    f"{base_dir}/models/{dataset_type}/BERT-single/{location}{aspect}/last_step"
)
model.eval()  # inference mode (disables dropout)

with torch.no_grad():  # Disable gradient calculation to save memory and computations
    # A sequence-classification model's output has no `last_hidden_state`
    # attribute; ask for all hidden states and keep the final layer instead.
    outputs = model(**inputs, output_hidden_states=True)
    embeddings = outputs.hidden_states[-1]