In [None]:
from chat_GPT import Chat
import pandas as pd
from tqdm.auto import tqdm

Importing the test data

In [None]:
# Read the cleaned test split and preview its first ten rows.
test_csv_path = "../data/test/clean_test.csv"
data = pd.read_csv(test_csv_path)
data.head(n=10)

Unnamed: 0,id,comment_text,label
0,0001ea8717f6de06,Thank you for understanding I think very highl...,0
1,000247e83dcc1211,Dear god this site is horrible,0
2,0002f87b16116a7f,Somebody will invariably try to add Religion ...,0
3,0003e1cccfd5a40a,It says it right there that it IS a type T...,0
4,00059ace3e3e9a53,Before adding a new product to the list m...,0
5,000663aff0fffc80,this other one from,0
6,000689dd34e20979,Reason for banning throwing This article ...,0
7,000844b52dee5f3f,blocked from editing Wikipedia,0
8,00091c35fa9d0465,Arabs are committing genocide in Iraq but no ...,1
9,000968ce11f5ee34,Please stop If you continue to vandalize Wikip...,0


Creating a subset of the data (the first 1,000 comments) to test the GPT-3.5 Turbo model

In [None]:
# First 1,000 comments (by position) as a plain Python list for the API calls.
messages = data['comment_text'].iloc[:1000].tolist()

## Classify the messages using GPT-3.5 turbo

Using the `Chat` class imported from the `chat_GPT` module, we create a chat object to classify the messages. The messages are sent in chunks of 10 to reduce the number of tokens per request to the API.

In [None]:
# Labels returned by the API, accumulated in the same order as `messages`.
responses = []
chat = Chat()

# Walk through the messages ten at a time; each batch is sent as one request group.
batch_starts = range(0, len(messages), 10)
for start in tqdm(batch_starts):
    batch = messages[start:start + 10]
    responses.extend(chat.batch_create_chats(batch))

The process was interrupted by the API after 800 messages, so only the first 800 comments received a GPT label.

Now save the responses to a local csv file.

In [None]:
# Keep only the rows that actually received a GPT label. The labelling run can
# stop early, so the row count is taken from `responses` rather than hard-coded.
n_labeled = len(responses)

# .copy() yields an independent frame, so adding a column below does not write
# into a slice of the original frame (which can raise SettingWithCopyWarning).
data = data.iloc[:n_labeled].copy()

# Align the labels with the kept rows and store them as integers.
data['gpt_label'] = pd.Series(responses, index=data.index).astype(int)
data.to_csv("../data/test/800_test_labeled_gpt.csv", index=False)

Calculate the weighted F1 score for the GPT-3.5 Turbo labels

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score


def get_f1_score(data, average='weighted'):
    """
    Computes the F1 score of the GPT labels against the reference labels.

    Args:
    data (pd.DataFrame): Frame with a 'label' column (reference labels) and a
        'gpt_label' column (labels predicted by GPT).
    average (str, optional): Averaging mode forwarded to sklearn's f1_score.
        Defaults to 'weighted'. Pass 'macro' to obtain a score that is directly
        comparable with macro-averaged metrics.

    Returns:
    float: The F1 score computed with the requested averaging mode.
    """
    return f1_score(data['label'], data['gpt_label'], average=average)

0.8215979843308009

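The weighted F1 score is 0.82, which is a relatively good score. Now we'll try to outperform this score using our DistilBERT transformer model. Note that the DistilBERT evaluation below reports macro-averaged metrics, so the two F1 values are only directly comparable when the same averaging mode is used for both.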

In [13]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

def evaluate_model(model_path, test_data_path, device='cpu', average='macro'):
    """
    Evaluates a DistilBERT model for sequence classification using the specified test data path.

    Args:
    model_path (str): Path to the model checkpoint file (a state dict that is
        loaded with torch.load and applied via load_state_dict).
    test_data_path (str): Path to the test data CSV file. The file must contain
        the 'comment_text' and 'label' columns.
    device (str, optional): Device to evaluate the model on. Defaults to 'cpu'.
        If a CUDA device is requested but CUDA is not available, evaluation
        falls back to the CPU.
    average (str, optional): Averaging mode forwarded to recall_score,
        precision_score and f1_score. Defaults to 'macro'.

    Returns:
    tuple: (accuracy, recall, precision, f1). Accuracy is the plain fraction of
        correct predictions; recall, precision and F1 are aggregated over the
        classes according to `average`.

    Raises:
    ValueError: If the test data file contains no rows.
    """
    # NOTE(review): torch, DataLoader/TensorDataset and the DistilBERT classes are
    # not imported in any cell visible here -- confirm they are imported before
    # this function is called on a fresh kernel.

    # Honour the requested device, but never select CUDA when it is unavailable.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')

    # Load model: build the architecture, then overwrite its weights with the
    # fine-tuned checkpoint.
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Load and prepare test data
    df = pd.read_csv(test_data_path)
    if df.empty:
        raise ValueError(f"No rows found in test data file: {test_data_path}")

    # Empty CSV fields are read back as NaN; convert them to empty strings so
    # every entry handed to the tokenizer is a str.
    texts = df['comment_text'].fillna('').astype(str).tolist()
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tokenized_data = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    label_tensor = torch.tensor(df['label'].values)
    test_dataset = TensorDataset(tokenized_data['input_ids'], tokenized_data['attention_mask'], label_tensor)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Evaluate model
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in tqdm(test_loader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit in each row.
            predicted = outputs.logits.argmax(dim=1)

            all_preds.extend(predicted.tolist())
            # Labels are only collected for scoring, so they stay on the CPU.
            all_labels.extend(batch_labels.tolist())

    # Calculate metrics (accuracy has no averaging mode)
    accuracy = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds, average=average)
    precision = precision_score(all_labels, all_preds, average=average)
    f1 = f1_score(all_labels, all_preds, average=average)

    return accuracy, recall, precision, f1

DistilBERT Classification: F1 Score: 0.8702393637283818

In [15]:
import os
from google.colab import drive
# Mount Google Drive so the labelled test set and the checkpoint stored there are reachable.
drive.mount('/content/drive')
base_path = '/content/drive/My Drive/'
model_path = os.path.join(base_path, 'NLP/hate_speech_detection_pipeline/model/model_distil.pth')
test_data_path = os.path.join(base_path, 'NLP/hate_speech_detection_pipeline/data/test/800_test_labeled_gpt.csv')

# Run the evaluation and unpack the four metrics it returns.
metrics = evaluate_model(model_path, test_data_path)
accuracy, recall, precision, f1 = metrics

# Report the metrics, one per line, after a blank spacer.
print("\n")
print(
    f'Accuracy: {accuracy}',
    f'Recall: {recall}',
    f'Precision: {precision}',
    f'F1 Score: {f1}',
    sep="\n",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 13/13 [00:12<00:00,  1.06it/s]


Accuracy: 0.8425
Recall: 0.9070611269781395
Precision: 0.6882200018272577
F1 Score: 0.7257262268853601



