In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
import random
import numpy as np
from sklearn.metrics import classification_report



In [2]:
# Model identifiers handed to `from_pretrained` further down in the notebook.
xlm_t_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
german_sentiment_path = "oliverguhr/german-sentiment-bert"

In [3]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs encoded features with their labels.

    ``encodings`` is a mapping from feature name to an indexable collection
    holding one entry per example; ``labels`` is an indexable collection with
    one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label collection alone determines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick example ``idx`` out of every feature and convert it to a tensor.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        # Assigned last, so it replaces any feature that is also called 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [4]:
def test_classification_report(model, test_text, test_labels):
    model.eval()
    predictions = []
    real_labels = []
    test_encodings = tokenizer(test_text, max_length=128, truncation=True, padding=True)
    test_dataset = MyDataset(test_encodings, test_labels)
    data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            real_labels.extend(labels.cpu().numpy())
    report = classification_report(real_labels, predictions, target_names=['Negative', 'Neutral', 'Positive'])
    print(report)

In [5]:
# Read the test texts, one per line. The context manager closes the file
# handle, which the bare open(...).read() never did.
# NOTE(review): the file is decoded as latin-1 — confirm this matches how it was written.
with open("/kaggle/input/germeval-no-scare/test_text.txt", encoding='latin-1') as file:
    test_text_noscare = file.read().rstrip('\n').split('\n')

In [6]:
# Parse one integer label per line of the labels file.
with open("/kaggle/input/germeval-no-scare/test_labels.txt", encoding='latin-1') as file:
    test_labels_noscare = list(map(int, map(str.strip, file)))

In [7]:
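# Disabled alternative: uncomment to evaluate the German BERT checkpoint instead of XLM-T.
# NOTE(review): its class-id order may differ from the Negative/Neutral/Positive order
# assumed by test_classification_report — check model.config.id2label before enabling.
# model = AutoModelForSequenceClassification.from_pretrained(german_sentiment_path)
# tokenizer = AutoTokenizer.from_pretrained(german_sentiment_path, use_fast=True)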

In [8]:
# Load the XLM-T checkpoint: the tokenizer first, then the classification model.
tokenizer = AutoTokenizer.from_pretrained(xlm_t_path, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(xlm_t_path)

config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [9]:
# Evaluate the loaded model on the test texts and labels read above.
test_classification_report(
    model=model,
    test_text=test_text_noscare,
    test_labels=test_labels_noscare,
)

              precision    recall  f1-score   support

    Negative       0.68      0.86      0.76    238676
     Neutral       0.29      0.77      0.42     41398
    Positive       0.97      0.75      0.84    579501

    accuracy                           0.78    859575
   macro avg       0.65      0.79      0.68    859575
weighted avg       0.85      0.78      0.80    859575

