In [1]:
import os
import pickle as pkl
import random
import time
from functools import partial

import fasttext
import numpy as np
import pandas as pd
import onnxruntime as ort

from dotenv import load_dotenv
from fastembed import TextEmbedding
from tqdm import tqdm
from xgboost import XGBClassifier
from transformers import AutoTokenizer

from prompt_classifier.modeling.fasttext import FastTextClassifier

# Load variables from a .env file (if present) into the process environment.
load_dotenv()
# Seeds only Python's built-in `random` module; NumPy and other libraries are not seeded here.
random.seed(22)
# Restrict CUDA-based libraries in this process to GPU 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Load Test Datasets

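Load the evaluation datasets used in this notebook (`data/domain_eval.csv` and `data/ood_eval.csv`). NOTE: the hate speech datasets listed below are not loaded by any code in this notebook — confirm whether this list is still accurate: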
- Jigsaw Toxicity
- OLID
- HateXplain
- TUKE Slovak

In [2]:
# Read the in-domain and out-of-domain evaluation sets.
domain_data, ood_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/domain_eval.csv", "data/ood_eval.csv")
)

In [3]:
# Dense sentence-embedding models, requested on the CUDA execution provider.
baai_embedding = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    providers=["CUDAExecutionProvider"],
)

mini_embedding = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    providers=["CUDAExecutionProvider"],
)

# TF-IDF
# One pickled TF-IDF object per domain. Each file is opened in a `with` block so
# its handle is closed as soon as the object has been loaded.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open("models/tfidf_finance.pkl", "rb") as tfidf_file:
    tfidf_finance = pkl.load(tfidf_file)
with open("models/tfidf_healthcare.pkl", "rb") as tfidf_file:
    tfidf_healthcare = pkl.load(tfidf_file)
with open("models/tfidf_law.pkl", "rb") as tfidf_file:
    tfidf_law = pkl.load(tfidf_file)

[0;93m2025-04-21 10:15:29.693512774 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-04-21 10:15:29.693564931 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m
[0;93m2025-04-21 10:15:30.261584792 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-04-21 10:15:30.261627592 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [None]:
# Candidate batch sizes: single-item plus 32 through 256, doubling each step.
batch_sizes = [
    1,
    32,
    64,
    128,
    256,
]

In [4]:
def save_predictions(predictions, dataset_name, model_name, embed_type=None, domain=None):
    """Store a prediction column in ``data/{dataset_name}_eval.csv``.

    The CSV is read, a column named ``pred-<model_name>[_<embed_type>][-<domain>]``
    is assigned (replacing any existing column of that name), and the file is
    written back in place without the index.

    Args:
        predictions: Values assigned to the prediction column.
        dataset_name: Selects the file ``data/{dataset_name}_eval.csv``.
        model_name: Model identifier placed after the ``pred-`` prefix.
        embed_type: Optional embedding identifier, appended as ``_<embed_type>``.
        domain: Optional domain identifier, appended as ``-<domain>``.
    """
    csv_path = f'data/{dataset_name}_eval.csv'
    frame = pd.read_csv(csv_path)

    # Build the column name incrementally; falsy optional parts are left out.
    column = f'pred-{model_name}'
    if embed_type:
        column += f'_{embed_type}'
    if domain:
        column += f'-{domain}'

    frame[column] = predictions
    frame.to_csv(csv_path, index=False)

# OOD

# FastText

In [5]:
# FastText inference
# For each dataset: load the three per-domain fastText models, predict every
# prompt with all of them, and save one combined prediction per row
# (0 when any domain model returns "__label__1", otherwise 1).
for dataset_name, inference_df in {'ood': ood_data}.items():
    print(f"Processing {dataset_name} dataset...")
    predictions_ft = []
    actuals_ft = []
    prediction_times_ft = []  # per-row latency in nanoseconds

    # Load the models first so a loading failure is reported as such.
    try:
        fasttext_classifier_finance = FastTextClassifier(
            train_data=inference_df, test_data=inference_df
        )
        fasttext_classifier_finance.model = fasttext.load_model(
            "models/fastText_finance_fasttext.bin"
        )

        fasttext_classifier_healthcare = FastTextClassifier(
            train_data=inference_df, test_data=inference_df
        )
        fasttext_classifier_healthcare.model = fasttext.load_model(
            "models/fastText_healthcare_fasttext.bin"
        )

        fasttext_classifier_law = FastTextClassifier(
            train_data=inference_df, test_data=inference_df
        )
        fasttext_classifier_law.model = fasttext.load_model(
            "models/fastText_law_fasttext.bin"
        )
    except Exception as e:
        print(f"Error loading fastText models: {e}")
        continue

    try:
        for _, row in tqdm(inference_df.iterrows(), total=len(inference_df)):
            text = str(row["prompt"])
            # NOTE(review): newlines are removed without inserting a space, so words
            # on adjacent lines are joined — confirm this matches training preprocessing.
            query = text.replace("\n", "")

            # Placeholder used when the row fails, so that predictions stay aligned
            # one-to-one with the rows of the CSV written by save_predictions.
            prediction = None
            try:
                start_time = time.perf_counter_ns()

                # Predictions from all three classifiers
                prediction_finance = fasttext_classifier_finance.model.predict(query)
                prediction_healthcare = fasttext_classifier_healthcare.model.predict(
                    query
                )
                prediction_law = fasttext_classifier_law.model.predict(query)

                end_time = time.perf_counter_ns()
                prediction_times_ft.append(end_time - start_time)

                prediction = (
                    0
                    if (
                        prediction_finance[0][0] == "__label__1"
                        or prediction_healthcare[0][0] == "__label__1"
                        or prediction_law[0][0] == "__label__1"
                    )
                    else 1
                )
                actuals_ft.append(row["label"])
            except Exception as e:
                print(f"Error processing row: {e}")

            predictions_ft.append(prediction)

        save_predictions(predictions_ft, dataset_name, 'fasttext')
    except Exception as e:
        # Inference or saving failed (model loading is handled separately above).
        print(f"Error processing dataset {dataset_name}: {e}")

Processing domain dataset...
Processing dataset domain...


100%|██████████| 9000/9000 [00:03<00:00, 2840.14it/s]



Processing ood dataset...
Processing dataset ood...


100%|██████████| 13457/13457 [00:02<00:00, 6516.84it/s]
100%|██████████| 13457/13457 [00:02<00:00, 6516.84it/s]


# ML models: SVM and XGBoost

In [6]:
# Embedding back-ends evaluated with the SVM and XGBoost classifiers.
embedding_models_names = [
    "mini",
    "baai",
    "tf_idf",
]

In [7]:
def get_embeddings(embedding_model, texts):
    """Embed ``texts`` with the embedding back-end named by ``embedding_model``.

    Args:
        embedding_model: One of ``"tf_idf"``, ``"mini"`` or ``"baai"``.
        texts: Collection of strings to embed.

    Returns:
        For ``"tf_idf"``: the result of ``tfidf_finance.transform(texts)``.
        For ``"mini"`` / ``"baai"``: a NumPy array built from the corresponding
        model's ``embed(texts)`` output.

    Raises:
        ValueError: If ``embedding_model`` is not one of the known names
            (previously any unknown name silently used the BAAI model).
    """
    if embedding_model == "tf_idf":
        # NOTE(review): the finance TF-IDF object is used regardless of which
        # domain classifier consumes the features, although healthcare and law
        # TF-IDF objects are also loaded — confirm this is intended.
        return tfidf_finance.transform(texts)
    if embedding_model == "mini":
        return np.array(list(mini_embedding.embed(texts)))
    if embedding_model == "baai":
        return np.array(list(baai_embedding.embed(texts)))
    raise ValueError(
        f"Unknown embedding model {embedding_model!r}; expected 'tf_idf', 'mini' or 'baai'"
    )

def predict_batch(texts, embedding_model, classifiers, batch_size):
    """Combine three classifiers' predictions into one label per text.

    Embeddings for all ``texts`` are computed in a single ``get_embeddings``
    call; only the classifier ``predict`` calls are run in slices of
    ``batch_size`` rows.

    Args:
        texts: Sequence of input strings.
        embedding_model: Embedding name forwarded to ``get_embeddings``.
        classifiers: Objects with a ``predict`` method; the outputs of the
            first three are combined.
        batch_size: Number of rows passed to ``predict`` at a time.

    Returns:
        A list with one value per text: 1 when none of the three classifiers
        predicted 1 for that row, otherwise 0.
    """
    all_embeddings = get_embeddings(embedding_model, texts)
    combined = []

    for start in range(0, len(texts), batch_size):
        chunk = all_embeddings[start:start + batch_size]
        votes = [clf.predict(chunk) for clf in classifiers]

        # Rows start at 0 and are flipped to 1 only where no classifier voted 1.
        none_positive = (votes[0] != 1) & (votes[1] != 1) & (votes[2] != 1)
        labels = np.zeros(chunk.shape[0])
        labels[none_positive] = 1

        combined.extend(labels)

    return combined

# ML Models inference
# For every embedding type and dataset, load the per-domain SVM and XGBoost
# classifiers (order: finance, healthcare, law), combine their votes with
# predict_batch, and save one column per model family.
for embedding_model in embedding_models_names:
    for dataset_name, inference_df in {'domain': domain_data, 'ood': ood_data}.items():
        print(f"Processing {dataset_name} dataset with {embedding_model} embeddings...")
        try:
            # Load SVM models; `with` closes each pickle file after loading.
            # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
            svm_classifiers = []
            for domain in ["finance", "healthcare", "law"]:
                with open(f"models/SVM_{domain}_{embedding_model}.pkl", "rb") as model_file:
                    svm_classifiers.append(pkl.load(model_file))

            # Load XGBoost models
            xgb_classifiers = []
            for domain in ["finance", "healthcare", "law"]:
                clf = XGBClassifier()
                clf.load_model(f"models/XGBoost_{domain}_{embedding_model}.json")
                xgb_classifiers.append(clf)

            # Build the prompt list once and reuse it for both model families.
            prompts = inference_df["prompt"].tolist()

            # Get predictions using batch processing
            # (predict_batch computes the embeddings itself on each call).
            predictions_svm = predict_batch(
                prompts,
                embedding_model,
                svm_classifiers,
                batch_size=128,
            )

            predictions_xgb = predict_batch(
                prompts,
                embedding_model,
                xgb_classifiers,
                batch_size=128,
            )

            # Save predictions
            save_predictions(predictions_svm, dataset_name, 'svm', embedding_model)
            save_predictions(predictions_xgb, dataset_name, 'xgb', embedding_model)

        except Exception as e:
            print(f"Error processing {dataset_name} dataset with {embedding_model} embeddings: {e}")


Processing domain dataset with mini embeddings...
Processing ood dataset with mini embeddings...
Processing ood dataset with mini embeddings...
Processing domain dataset with baai embeddings...
Processing domain dataset with baai embeddings...
Processing ood dataset with baai embeddings...
Processing ood dataset with baai embeddings...
Processing domain dataset with tf_idf embeddings...
Processing domain dataset with tf_idf embeddings...
Processing ood dataset with tf_idf embeddings...
Processing ood dataset with tf_idf embeddings...


# Tibor

In [8]:
# Tokenizer for the same checkpoint name that `mini_embedding` is configured with.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [9]:
# Initialize tokenizer function for batch processing
# Pre-binds padding, truncation (max_length=512) and PyTorch tensor output
# ("pt"), so callers only need to pass the text.
tokenizer_func = partial(
    tokenizer, padding=True, truncation=True, return_tensors="pt", max_length=512
)

In [10]:
try:
    # Load ONNX models
    # A single session is created; providers are listed with CUDA first and CPU second.
    mlp_classifier = ort.InferenceSession(
        "models/text_classifier_optimized_int8.onnx",
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
    )
    print("Successfully loaded all ONNX models")

except Exception as e:
    # Best-effort: the error is only printed. On a fresh kernel `mlp_classifier`
    # then stays undefined, so later cells that use it will fail.
    print(f"Error loading ONNX models: {e}")

Successfully loaded all ONNX models


[0;93m2025-04-21 10:22:50.388985667 [W:onnxruntime:, transformer_memcpy.cc:74 ApplyImpl] 141 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m


In [22]:
# Re-read both evaluation CSVs from disk (save_predictions rewrites these files).
domain_data, ood_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/domain_eval.csv", "data/ood_eval.csv")
)

# Datasets to run through the ONNX classifier, keyed by dataset name.
data = dict(domain=domain_data)

In [24]:
# Run the ONNX classifier over every prompt of each dataset in `data` and save
# the argmax class index per row. `domain` here is the dataset key of `data`.
# NOTE(review): the FastText/SVM/XGBoost cells write 0 when any domain model
# fires and 1 otherwise, whereas this cell stores the raw argmax index —
# confirm both use the same label convention.
for domain, inference_df in data.items():
    print(f"\nProcessing {domain} dataset...")
    predictions_mlp = []
    prediction_times_mlp = []  # per-row latency (tokenization + inference) in nanoseconds
    actuals_mlp = []

    try:
        # Only the prompt column is needed, so iterate it directly instead of
        # materialising every row with iterrows().
        for prompt in tqdm(inference_df["prompt"], total=len(inference_df)):
            # Tokenize input text; str() mirrors the FastText cells so a
            # non-string prompt (e.g. an empty CSV cell) does not abort the run.
            start_time = time.perf_counter_ns()
            inputs = tokenizer_func(str(prompt))

            # Convert to numpy arrays for ONNX
            onnx_inputs = {
                'input_ids': inputs['input_ids'].numpy(),
                'attention_mask': inputs['attention_mask'].numpy(),
            }

            # Run inference
            pred = mlp_classifier.run(None, onnx_inputs)[0]
            end_time = time.perf_counter_ns()
            prediction_times_mlp.append(end_time - start_time)
            predictions_mlp.append(np.argmax(pred))

        save_predictions(predictions_mlp, domain, 'mlp_onnx')

    except Exception as e:
        print(f"Error processing {domain} dataset: {e}")



Processing domain dataset...


100%|██████████| 9000/9000 [06:00<00:00, 25.00it/s]



# Domain

In [21]:
# FastText domain inference
# For each dataset: run the three per-domain fastText models on every prompt and
# save one prediction column per domain (1 when that model returns "__label__1").
for domain, inference_df in {'domain': domain_data}.items():
    print(f"Processing {domain} dataset...")
    # Reset per dataset so rows from one dataset never leak into another's columns.
    predictions_ft = []
    actuals_ft = []
    prediction_times_ft = []  # per-row latency in nanoseconds
    try:
        # Load domain-specific models
        fasttext_classifier_finance = FastTextClassifier(train_data=inference_df, test_data=inference_df)
        fasttext_classifier_finance.model = fasttext.load_model("models/fastText_finance_fasttext.bin")

        fasttext_classifier_healthcare = FastTextClassifier(train_data=inference_df, test_data=inference_df)
        fasttext_classifier_healthcare.model = fasttext.load_model("models/fastText_healthcare_fasttext.bin")

        fasttext_classifier_law = FastTextClassifier(train_data=inference_df, test_data=inference_df)
        fasttext_classifier_law.model = fasttext.load_model("models/fastText_law_fasttext.bin")

        for _, row in tqdm(inference_df.iterrows(), total=len(inference_df)):
            text = str(row["prompt"])
            # NOTE(review): newlines are removed without inserting a space, so words
            # on adjacent lines are joined — confirm this matches training preprocessing.
            query = text.replace("\n", "")

            # Placeholder kept when the row fails, so predictions stay aligned
            # one-to-one with the rows of the CSV written by save_predictions.
            row_prediction = {'finance': None, 'healthcare': None, 'law': None}
            try:
                start_time = time.perf_counter_ns()

                # Get predictions for each domain
                prediction_finance = fasttext_classifier_finance.model.predict(query)
                prediction_healthcare = fasttext_classifier_healthcare.model.predict(query)
                prediction_law = fasttext_classifier_law.model.predict(query)

                end_time = time.perf_counter_ns()
                prediction_times_ft.append(end_time - start_time)

                # Store predictions by domain
                row_prediction = {
                    'finance': 1 if prediction_finance[0][0] == "__label__1" else 0,
                    'healthcare': 1 if prediction_healthcare[0][0] == "__label__1" else 0,
                    'law': 1 if prediction_law[0][0] == "__label__1" else 0
                }
                actuals_ft.append(row["label"])

            except Exception as e:
                print(f"Error processing row: {e}")

            predictions_ft.append(row_prediction)

        # Save predictions with domain suffixes, into the file of the dataset
        # currently being processed.
        for domain_name in ['finance', 'healthcare', 'law']:
            domain_preds = [p[domain_name] for p in predictions_ft]
            save_predictions(domain_preds, domain, 'fasttext', domain=domain_name)

    except Exception as e:
        print(f"Error processing dataset: {e}")

# ML Models domain inference
# For every embedding type, embed the domain prompts once and save each
# per-domain SVM / XGBoost model's raw predictions in its own column.
for embedding_model in embedding_models_names:
    print(f"Processing domain dataset with {embedding_model} embeddings...")

    try:
        # Get embeddings once per dataset, via the shared helper so this cell
        # and predict_batch select the embedding back-end the same way.
        test_embeds = get_embeddings(embedding_model, domain_data["prompt"])

        # Load domain-specific models
        for model_domain in ['finance', 'healthcare', 'law']:
            # Load SVM models
            # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
            with open(f"models/SVM_{model_domain}_{embedding_model}.pkl", "rb") as f:
                svm_model = pkl.load(f)

            # Load XGBoost models
            xgb_model = XGBClassifier()
            xgb_model.load_model(f"models/XGBoost_{model_domain}_{embedding_model}.json")

            # Get predictions
            svm_preds = svm_model.predict(test_embeds)
            xgb_preds = xgb_model.predict(test_embeds)

            # Save predictions with domain suffixes
            save_predictions(svm_preds, 'domain', 'svm', embedding_model, domain=model_domain)
            save_predictions(xgb_preds, 'domain', 'xgb', embedding_model, domain=model_domain)

    except Exception as e:
        print(f"Error processing embeddings {embedding_model}: {e}")

Processing domain dataset...


100%|██████████| 9000/9000 [00:02<00:00, 3078.62it/s]



Processing domain dataset with mini embeddings...
Processing domain dataset with baai embeddings...
Processing domain dataset with baai embeddings...
Processing domain dataset with tf_idf embeddings...
Processing domain dataset with tf_idf embeddings...
