In [None]:
from chat_GPT import Chat
import pandas as pd
from tqdm.auto import tqdm

Importing the test data

In [None]:
# Read the cleaned test split into a DataFrame and preview its first ten rows.
test_csv_path = "../data/test/clean_test.csv"
data = pd.read_csv(test_csv_path)
data.head(n=10)

Unnamed: 0,id,comment_text,label
0,0001ea8717f6de06,Thank you for understanding I think very highl...,0
1,000247e83dcc1211,Dear god this site is horrible,0
2,0002f87b16116a7f,Somebody will invariably try to add Religion ...,0
3,0003e1cccfd5a40a,It says it right there that it IS a type T...,0
4,00059ace3e3e9a53,Before adding a new product to the list m...,0
5,000663aff0fffc80,this other one from,0
6,000689dd34e20979,Reason for banning throwing This article ...,0
7,000844b52dee5f3f,blocked from editing Wikipedia,0
8,00091c35fa9d0465,Arabs are committing genocide in Iraq but no ...,1
9,000968ce11f5ee34,Please stop If you continue to vandalize Wikip...,0


Creating a subset of the data (the first 1,000 comments) to test the GPT-3.5 Turbo model

In [None]:
# Take the first 1,000 comments (by position) as a plain Python list.
messages = data['comment_text'].iloc[:1000].tolist()

## Classify the messages using GPT-3.5 turbo

Using the `Chat` class imported from the `chat_GPT` module, we create a chat object to classify the messages. We send the messages in chunks of 10 to reduce the number of tokens per request to the API.

In [None]:
# Send the comments to the chat client ten at a time and collect the
# returned labels in the same order as the input messages.
responses = []
chat = Chat()

for start in tqdm(range(0, len(messages), 10)):
    responses.extend(chat.batch_create_chats(messages[start:start + 10]))

The process was interrupted by the API after 800 messages, so only the first 800 responses are available.

Now save the responses to a local csv file.

In [None]:
# Keep only the rows that actually received a GPT response. Slicing by
# len(responses) rather than a hard-coded row count keeps comments and labels
# aligned wherever the API run stopped. .assign() returns a new frame, so the
# column is never written onto a slice of the original DataFrame
# (which is what triggers pandas' SettingWithCopyWarning).
gpt_labels = pd.Series(responses).astype(int).to_numpy()
data = data.iloc[:len(responses)].assign(gpt_label=gpt_labels)

# The file name is kept as-is because the BERT evaluation reads this exact path.
data.to_csv("../data/test/800_test_labeled_gpt.csv", index=False)

Calculate the weighted F1 score for the GPT-3.5 labels

In [None]:
from sklearn.metrics import f1_score

# Weighted F1 of the GPT labels against the ground-truth labels; being the
# last expression of the cell, the value is displayed as the cell output.
f1_score(y_true=data['label'], y_pred=data['gpt_label'], average='weighted')

0.8215979843308009

The F1 score is 0.82, which is a relatively good score. Now we'll try to outperform this score using our BERT transformer model.

In [None]:
from google.colab import drive
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
from transformers import BertForSequenceClassification
import numpy as np
import os  # Importiert das OS-Modul
import pandas as pd  # Für das Einlesen der CSV-Datei
from transformers import BertTokenizer
from tqdm import tqdm

# Mount Google Drive so the labelled test set and the fine-tuned weights are reachable.
drive.mount('/content/drive')
base_path = '/content/drive/My Drive/'
test_data_path = os.path.join(base_path, 'NLP/hate_speech_detection_pipeline/data/test/800_test_labeled_gpt.csv')
model_path = os.path.join(base_path, 'NLP/hate_speech_detection_pipeline/model/model2.pth')

# Single place to choose the device; the checkpoint, the model and every batch use it.
device = torch.device('cpu')

# Build the architecture first, then overwrite its weights with the fine-tuned
# checkpoint. The "newly initialized" classifier warning emitted by
# from_pretrained describes the state *before* load_state_dict runs.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()  # inference mode; one call is enough

# Load the test data.
df = pd.read_csv(test_data_path)

# Tokenize the test data. Comments that were empty when the CSV was written
# come back from read_csv as NaN (a float), which the tokenizer rejects, so
# they are normalised to empty strings first.
texts = df['comment_text'].fillna('').astype(str).tolist()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_data = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Convert the ground-truth labels to a tensor.
labels = torch.tensor(df['label'].values)

# Build the DataLoader; shuffle=False keeps predictions in row order.
test_dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], labels)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Evaluate the model.
all_preds = []
all_labels = []
with torch.no_grad():
    # Batch-level names are distinct from `labels` so the full label tensor
    # defined above is not overwritten by the last batch.
    for batch_inputs, batch_masks, batch_labels in tqdm(test_loader, desc="Evaluating"):
        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)

        # Forward pass.
        outputs = model(batch_inputs, attention_mask=batch_masks)

        # Predicted class = index of the largest logit.
        predicted = outputs.logits.argmax(dim=1)

        # Collect results.
        all_preds.extend(predicted.tolist())
        all_labels.extend(batch_labels.tolist())

# Compute the weighted F1 score.
f1 = f1_score(all_labels, all_preds, average='weighted')
print(f'F1 Score: {f1}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   0%|          | 0/13 [00:00<?, ?it/s]