# Classification Pipeline

Importing necessary modules

In [1]:
from apiGPT import Chat
from distilBERT import DistilBERTModelClassificator
from BERT import BERTModelClassificator
import pandas as pd
from tqdm.auto import tqdm

Import test data and creating a subset of the data to test the GPT-3 model

In [6]:
data = pd.read_csv("data/test/balanced_test.csv")
test_data_sample = data[0:1000]

## Classify the messages using GPT-3.5 turbo with Zero-Shot Classification

Importing the chat_GPT class and creating a chat object to classify the messages. We used chunks of 10 messages to reduce the number tokens per request to the API.

In [2]:
chat = Chat()

tqdm.pandas(desc="Classifiying messages")
test_data_sample['gpt_zeroshot'] = test_data_sample['comment_text'].apply(lambda x: chat.create_chat(x))

We had 5 messages that could not be classified by the GPT-3.5 turbo model. We will remove these messages from the test data.

In [2]:
test_data_sample['gpt_zeroshot'] = test_data_sample['gpt_zeroshot'].astype(int)
test_data_sample = test_data_sample[test_data_sample['gpt_zeroshot'] >= 0]

Now save the responses to a local csv file.

In [32]:
test_data_sample.to_csv("data/test/data_labeled_zeroshot.csv", index=False)

GPTNow we'll try to outperform this score using a our BERT transformer model.

## Classify the messages using GPT-3.5 turbo with Few-Shot Classification

Importing the chat_GPT class and creating a chat object to classify the messages. We used chunks of 10 messages to reduce the number tokens per request to the API.

The prompt will be enriched wich examples of toxic and non-toxic comments to improve the classification.

In [10]:
test_data_sample = pd.read_csv("results/data_labeled_distilBERT_GPTzeroshot.csv")

In [11]:
chat = Chat()
tqdm.pandas(desc="Classifiying messages")
test_data_sample['gpt_fewshot'] = test_data_sample['comment_text'].apply(lambda x: chat.create_chat(x))

In [12]:
test_data_sample['gpt_fewshot'] = test_data_sample['gpt_fewshot'].astype(int)
test_data_sample = test_data_sample[test_data_sample['gpt_fewshot'] >= 0]

In [13]:
len(test_data_sample)

994

In [14]:
test_data_sample.to_csv("results/data_labeled_distilBERT_GPTzeroshot_GPTfewshot.csv", index=False)

## Classify the messages using our finetuned DistilBERT model

In [4]:
distbert = DistilBERTModelClassificator("model/model_distil.pth")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
tqdm.pandas(desc="Classifiying messages")
test_data_sample['distilBERT'] = test_data_sample['comment_text'].apply(lambda x: bert.predict_label(x))

In [47]:
test_data_sample.to_csv("results/data_labeled_distilBERT_GPTzeroshot.csv", index=False)

## Classify the messages using our finetuned BERT model

In [8]:
data = pd.read_csv("results/data_labeled_distilBERT_GPTzeroshot_GPTfewshot.csv")

In [6]:
bert = BERTModelClassificator("model/model2.pth")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tqdm.pandas(desc="Classifiying messages")
data['BERT'] = data['comment_text'].apply(lambda x: bert.predict_label(x))

## Evaluating different outputs

Define function to calculate the accuracy, recall, precision and F1 score.

In [10]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

def get_scores(data, column):
    accuracy = accuracy_score(data['label'], data[column])
    # Setze zero_division=0 für Recall
    recall = recall_score(data['label'], data[column], average='macro', zero_division=0)
    precision = precision_score(data['label'], data[column], average='macro')
    f1 = f1_score(data['label'], data[column], average='macro')
    return accuracy, recall, precision, f1

### GPT-3.5 turbo Zero-Shot Classification

In [15]:
accuracy3, recall3, precision3, f1_3 = get_scores(data, 'gpt_zeroshot')

print('GPT-3.5 turbo Zero-Shot Classification')
print(f'Accuracy: {accuracy3}')
print(f'Recall: {recall3}')
print(f'Precision: {precision3}')
print(f'F1 Score: {f1_3}')

GPT-3.5 turbo Zero-Shot Classification
Accuracy: 0.8329979879275654
Recall: 0.8353518158236057
Precision: 0.8394240236297998
F1 Score: 0.8327271105271908


### GPT-3.5 turbo few-Shot Classification

In [16]:
accuracy2, recall2, precision2, f1_2 = get_scores(data, 'gpt_fewshot')

print('GPT-3.5 turbo Few-Shot Classification')
print(f'Accuracy: {accuracy2}')
print(f'Recall: {recall2}')
print(f'Precision: {precision2}')
print(f'F1 Score: {f1_2}')

GPT-3.5 turbo Few-Shot Classification
Accuracy: 0.8350100603621731
Recall: 0.836401588845655
Precision: 0.837155926034253
F1 Score: 0.8349860106165451


### fine tuned DistilBERT

In [18]:
accuracy1, recall1, precision1, f1_1 = get_scores(data, 'distilBERT')

print('fine tuned DistilBERT')
print(f'Accuracy: {accuracy1}')
print(f'Recall: {recall1}')
print(f'Precision: {precision1}')
print(f'F1 Score: {f1_1}')

fine tuned DistilBERT
Accuracy: 0.8983903420523138
Recall: 0.9009930285343709
Precision: 0.9068554273846976
F1 Score: 0.898181663287732


### fine tuned BERT

In [20]:
accuracy, recall, precision, f1 = get_scores(data, 'BERT')

print('fine tuned BERT')
print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')

fine tuned BERT
Accuracy: 0.903420523138833
Recall: 0.9061324578469521
Precision: 0.9127166534862264
F1 Score: 0.9031947831662155


In [21]:
from tabulate import tabulate
from colorama import Fore, Style

# Farbcodes definieren
def colorize(value):
    if isinstance(value, float):
        if value >= 0.89:
            return Fore.GREEN + str(round(value, 4)) + Style.RESET_ALL
        elif value >= 0.75:
            return Fore.YELLOW + str(round(value, 4)) + Style.RESET_ALL
        else:
            return Fore.RED + str(round(value, 4)) + Style.RESET_ALL
    else:
        return value

# Ergebnisse der verschiedenen Modelle
results = [
    ['fine tuned BERT', accuracy, recall, precision, f1],
    ['GPT-3.5 turbo Zero-Shot', accuracy3, recall3, precision3, f1_3],
    ['GPT-3.5 turbo Few-Shot', accuracy2, recall2, precision2, f1_2]
]

# Spaltenüberschriften
headers = ["Model", "Accuracy", "Recall", "Precision", "F1 Score"]

# Farbcodierte Ergebnisse
colorized_results = [[colorize(value) for value in row] for row in results]

# Tabelle erstellen und anzeigen
table = tabulate(colorized_results, headers=headers, tablefmt="fancy_grid")
print(table)

╒═════════════════════════╤════════════╤══════════╤═════════════╤════════════╕
│ Model                   │   Accuracy │   Recall │   Precision │   F1 Score │
╞═════════════════════════╪════════════╪══════════╪═════════════╪════════════╡
│ fine tuned BERT         │     [32m0.9034[0m │   [32m0.9061[0m │      [32m0.9127[0m │     [32m0.9032[0m │
├─────────────────────────┼────────────┼──────────┼─────────────┼────────────┤
│ GPT-3.5 turbo Zero-Shot │     [33m0.833[0m  │   [33m0.8354[0m │      [33m0.8394[0m │     [33m0.8327[0m │
├─────────────────────────┼────────────┼──────────┼─────────────┼────────────┤
│ GPT-3.5 turbo Few-Shot  │     [33m0.835[0m  │   [33m0.8364[0m │      [33m0.8372[0m │     [33m0.835[0m  │
╘═════════════════════════╧════════════╧══════════╧═════════════╧════════════╛
