In [2]:
import os
import pickle as pkl
import random
import time
import traceback

import fasttext
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn import metrics
from tqdm import tqdm
from xgboost import XGBClassifier

from prompt_classifier.modeling.dspy_llm import LlmClassifier
from prompt_classifier.modeling.fasttext import FastTextClassifier
from prompt_classifier.modeling.nli_modernbert import ModernBERTNLI

# Fix the seed of Python's built-in RNG so any use of `random` is repeatable.
random.seed(1)
# Run python-dotenv's loader; OPENAI_API_KEY and PROXY_URL are later read with os.getenv.
load_dotenv()

In [3]:
# CSV file names of the dataset's splits; only the validation split is read here.
splits = {'train': 'train_dataset.csv', 'validation': 'val_dataset.csv', 'test': 'test_dataset.csv'}

# Keep rows flagged (== 1) in at least one toxicity column, expose the comment text
# as "prompt", and give every remaining row the label 0.
inference_df = (
    pd.read_csv("hf://datasets/Arsive/toxicity_classification_jigsaw/" + splits["validation"])
    .loc[
        lambda frame: frame[
            ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        ].eq(1).any(axis=1)
    ]
    .rename(columns={"comment_text": "prompt"})
    .assign(label=0)
)

In [4]:
# Embedding model used for the SVM / XGBoost features; the CUDA execution provider is requested.
baai_embedding = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", providers=["CUDAExecutionProvider"])

[0;93m2025-03-28 20:31:13.813905722 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-03-28 20:31:13.813942844 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [None]:
# GPT Classifier
# The API key and proxy URL are read from the environment; os.getenv returns None
# for a variable that is not set.
llm_classifier = LlmClassifier(
    api_key=os.getenv("OPENAI_API_KEY"),
    proxy_url=os.getenv("PROXY_URL"),
    model_name="gpt-4o-mini",
    domain="finance",
    train_data=inference_df,
    test_data=inference_df,
)

try:
    # Load the saved finance model file before predicting.
    llm_classifier.load_model("models/gpt-4o-mini-finance.json")

    test_predictions, test_actuals, test_latency = llm_classifier.predict()
    print(f"Test Accuracy: {metrics.accuracy_score(test_actuals, test_predictions)}")

except Exception as e:
    # Best-effort cell: the notebook keeps running when this model fails, but the
    # full traceback is printed so the root cause is not reduced to a one-line message.
    print(f"Error running GPT model: {e}")
    traceback.print_exc()

In [5]:
try:
    # ModernBERT Classifier
    bert_classifier = ModernBERTNLI(domain="finance")
    # NOTE(review): the device is hard-coded; presumably this fails on a machine
    # without CUDA — confirm whether a CPU fallback is wanted.
    bert_classifier.classifier.model.to("cuda")

    # Test predictions: one predict() call per prompt, timed individually in nanoseconds.
    test_predictions = []
    test_times = []
    for _, row in tqdm(inference_df.iterrows(), total=len(inference_df)):
        start_time = time.perf_counter_ns()
        pred = bert_classifier.predict(row["prompt"])
        test_predictions.append(pred)
        test_times.append(time.perf_counter_ns() - start_time)

    test_acc = metrics.accuracy_score(inference_df["label"], test_predictions)
    # Report the score, as the GPT cell does; previously it was computed and never shown.
    print(f"ModernBERT Test Accuracy: {test_acc}")

except Exception as e:
    # Best-effort cell: the notebook keeps running when this model fails, but the
    # full traceback is printed because the one-line message alone (e.g. an import
    # failure pointing at "its traceback") is not enough to diagnose the cause.
    print(f"Error running ModernBERT model: {e}")
    traceback.print_exc()

Error running ModernBERT model: Failed to import transformers.models.modernbert.modeling_modernbert because of the following error (look up to see its traceback):
partially initialized module 'torch._dynamo' has no attribute 'external_utils' (most likely due to a circular import)


In [6]:
# Per-row ground truth, predicted class (0/1) and predict() latency in nanoseconds.
actuals_ft, predictions_ft, prediction_times_ft = [], [], []

# fastText
# The wrapper is constructed around the inference frame, then its `model` attribute
# is replaced by the binary loaded from disk.
fasttext_classifier = FastTextClassifier(train_data=inference_df, test_data=inference_df)
fasttext_classifier.model = fasttext.load_model("models/fastText_finance_fasttext.bin")

for _, row in tqdm(inference_df.iterrows(), total=len(inference_df)):
    # Newlines are deleted, not replaced by a space, so the last word of one line is
    # glued to the first word of the next.
    # NOTE(review): presumably mirrors the preprocessing used at training time — confirm.
    query = str(row["prompt"]).replace("\n", "")

    # Only the predict() call itself is timed.
    start_time = time.perf_counter_ns()
    prediction = fasttext_classifier.model.predict(query)
    prediction_times_ft.append(time.perf_counter_ns() - start_time)

    # The top-ranked label decides the class: "__label__1" -> 1, anything else -> 0.
    predictions_ft.append(1 if prediction[0][0] == "__label__1" else 0)
    actuals_ft.append(row["label"])

100%|██████████| 3214/3214 [00:00<00:00, 6290.89it/s]


In [7]:

# Embedding test data
# All prompts are embedded in one pass; the timer covers the whole pass, including
# materialising the generator into a NumPy array.
start_time = time.perf_counter_ns()
test_embeds = np.array([*baai_embedding.embed(inference_df["prompt"])])
embed_times = time.perf_counter_ns() - start_time

# Average embedding cost per prompt, in nanoseconds.
mean_embed_time = embed_times / len(inference_df)

# XGBoost: create an empty classifier, then load the saved model from its JSON file.
xgb_classifier = XGBClassifier()
xgb_classifier.load_model("models/XGBoost_finance_baai.json")

# SVM: restored from a pickle file.
# NOTE: unpickling can execute arbitrary code — only load pickles from a trusted source.
with open("models/SVM_finance_baai.pkl", mode="rb") as svm_file:
    svm_classifier = pkl.load(svm_file)

def _time_single_row_predictions(classifier, embeddings):
    """Predict every embedding on its own and time each call.

    Args:
        classifier: Model exposing ``predict``; it is called with one row at a time.
        embeddings: Iterable of NumPy vectors; each is reshaped to ``(1, -1)``.

    Returns:
        Tuple ``(predictions, times_ns)``: the first element of each ``predict``
        result, and the matching durations in nanoseconds.
    """
    predictions = []
    times_ns = []
    for embedding in embeddings:
        # The reshape stays inside the timed region, as in the original measurement.
        start_ns = time.perf_counter_ns()
        prediction = classifier.predict(embedding.reshape(1, -1))
        end_ns = time.perf_counter_ns()

        times_ns.append(end_ns - start_ns)
        predictions.append(prediction[0])
    return predictions, times_ns


# SVM is run over the whole set first, then XGBoost, as two separate passes.
predictions_svm, prediction_times_svm = _time_single_row_predictions(svm_classifier, test_embeds)
predictions_xgb, prediction_times_xgb = _time_single_row_predictions(xgb_classifier, test_embeds)

In [9]:
# Per-prediction latencies converted from nanoseconds to milliseconds, keyed by model name.
times_ms = {
    model_name: np.array(times_ns) / 1_000_000
    for model_name, times_ns in (
        ('fastText', prediction_times_ft),
        ('XGBoost', prediction_times_xgb),
        ('SVM', prediction_times_svm),
    )
}

# Create DataFrame for plotting: long format, one row per (model, single latency).
plot_data = pd.DataFrame(
    [
        (model_name, latency_ms)
        for model_name, latencies_ms in times_ms.items()
        for latency_ms in latencies_ms
    ],
    columns=['Model', 'Latency (ms)'],
)

In [13]:
# Calculate accuracy for each model
# The ground truth collected in the fastText loop (actuals_ft) is the reference for
# all three models. Every row was given label 0 when the data was loaded, so each
# score is the share of prompts predicted as class 0.
accuracy_ft, accuracy_svm, accuracy_xgb = (
    metrics.accuracy_score(actuals_ft, model_predictions)
    for model_predictions in (predictions_ft, predictions_svm, predictions_xgb)
)

# Print the accuracies
print(
    f"fastText Accuracy: {accuracy_ft}",
    f"SVM Accuracy: {accuracy_svm}",
    f"XGBoost Accuracy: {accuracy_xgb}",
    sep="\n",
)

fastText Accuracy: 0.7078406969508401
SVM Accuracy: 0.9499066583696328
XGBoost Accuracy: 0.924082140634723
