In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

import random
import numpy as np
from sklearn.metrics import classification_report

## Parameters

In [None]:
# --- Fine-tuning hyper-parameters ---
EPOCHS = 3
BATCH_SIZE = 32
LR = 2e-5

# Upper bound on the number of training examples; -1 means use the whole training set.
MAX_TRAINING_EXAMPLES = 1000

# Checkpoint to start from:
#   "cardiffnlp/twitter-xlm-roberta-base"            -> fine-tune the language model
#   "cardiffnlp/twitter-xlm-roberta-base-sentiment"  -> fine-tune the sentiment classifier
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

## Data

The model is evaluated on German sentiment data from the `german-sentiment` project:\
https://github.com/oliverguhr/german-sentiment?tab=readme-ov-file#data-sets

In [None]:
# Load the evaluation texts, one example per line.
# The context manager guarantees the file handle is closed once the content is read.
# NOTE(review): the file is decoded as latin-1; if it is actually UTF-8, non-ASCII
# characters (e.g. German umlauts) will be garbled — confirm the file's encoding.
with open("/kaggle/input/fine-tuning-sentiment-analysis-data/test_text.txt", encoding='latin-1') as text_file:
    # Strip trailing newlines first so the split does not yield an empty last example.
    test_text = text_file.read().rstrip('\n').split('\n')

In [None]:
# Read the gold labels: one integer per line, in file order.
test_labels = []
with open("/kaggle/input/fine-tuning-sentiment-analysis-data/test_labels.txt", encoding='latin-1') as file:
    for line in file:
        # Surrounding whitespace (including the newline) is removed before parsing.
        test_labels.append(int(line.strip()))

In [None]:
# Load the tokenizer for the selected checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
)

In [None]:
# Load the selected checkpoint with a sequence-classification head.
# NOTE(review): the classification report in this notebook names three classes —
# confirm the chosen checkpoint's head has three outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
)

In [None]:
# Tokenize all test texts in one call: truncation is enabled with a cap of
# 128 tokens, and padding is enabled.
test_encodings = tokenizer(
    test_text,
    truncation=True,
    max_length=128,
    padding=True,
)

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence holding one label per example.

    Raises:
        ValueError: If any encoding field does not contain exactly as many
            entries as ``labels``.
    """

    def __init__(self, encodings, labels):
        # The dataset length comes from `labels` while items are looked up in
        # `encodings`; reject mismatched inputs up front instead of silently
        # evaluating a misaligned or truncated dataset.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} entries "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including its label under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

# Wrap the tokenized test texts together with their labels.
test_dataset = MyDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Iterate over the test set in fixed order, 16 examples per batch.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
)

# Put the model on the selected device. As the cell's last expression, the
# returned module is also displayed by the notebook.
model.to(device)


Run inference in batches to speed up evaluation.

In [None]:
def predict(model, data_loader):
    """Run the model over ``data_loader`` and collect predictions and gold labels.

    The model is switched to evaluation mode (and left in it) and gradients are
    disabled for the whole pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        Tuple ``(predictions, real_labels)`` of two lists aligned by position:
        the arg-max class index per example and the corresponding label.
    """
    # Take the device from the model itself rather than from a notebook-global
    # variable, so the function works regardless of which cells ran before it.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions = []
    real_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Highest-scoring class along the class dimension.
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            # Labels are only collected, never fed to the model, so they are
            # not moved to the model's device.
            real_labels.extend(batch['labels'].cpu().numpy())
    return predictions, real_labels

In [None]:
# Run inference over the full test loader.
predictions, real_labels = predict(model=model, data_loader=test_loader)

In [None]:
# Per-class metrics for the three sentiment classes.
# `labels` is passed explicitly so the report is still produced, with class names
# correctly aligned, when one class never occurs in the gold labels or predictions;
# without it scikit-learn raises because the number of observed classes no longer
# matches the length of `target_names`.
# NOTE(review): assumes label ids 0/1/2 correspond to Negative/Neutral/Positive —
# confirm against the data set and the checkpoint's label mapping.
report = classification_report(
    real_labels,
    predictions,
    labels=[0, 1, 2],
    target_names=['Negative', 'Neutral', 'Positive'],
)
print(report)