In [11]:
import json
import os
import pickle as pkl
import random
import statistics
import time
import xgboost as xgb

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn import metrics
from sklearn.svm import SVC
from tqdm import tqdm
from xgboost import XGBClassifier
import fasttext

from prompt_classifier.modeling.dspy_gpt import GPT4oMini
from prompt_classifier.modeling.fasttext import FastTextClassifier
from prompt_classifier.modeling.nli_modernbert import ModernBERTNLI

# Load environment variables from a .env file (OPENAI_API_KEY and PROXY_URL are
# read via os.getenv further down).
load_dotenv()

# Seed every RNG the notebook draws from. pandas' DataFrame.sample falls back to
# NumPy's global generator when no random_state is passed, so seeding only the
# stdlib `random` module leaves the shuffles and train/test splits unseeded.
random.seed(1)
np.random.seed(1)

ModuleNotFoundError: No module named 'seaborn'

In [2]:
# --- Processed prompts -------------------------------------------------------
# Read both CSVs, stack finance on top of general, then shuffle the rows and
# renumber the index from 0.
finance_prompts = pd.read_csv("data/processed/finance_prompts.csv")
general_prompts = pd.read_csv("data/processed/general_prompts.csv")

finance_dataset = pd.concat([finance_prompts, general_prompts])
finance_dataset = finance_dataset.sample(frac=1).reset_index(drop=True)

# --- Interim prompts ---------------------------------------------------------
# Same stack-and-shuffle treatment for the files under data/interim.
finance_prompts_interim = pd.read_csv("data/interim/finance_prompts.csv")
general_prompts_interim = pd.read_csv("data/interim/general_prompts.csv")

finance_dataset_interim = pd.concat(
    [finance_prompts_interim, general_prompts_interim]
)
finance_dataset_interim = finance_dataset_interim.sample(frac=1).reset_index(drop=True)

# --- Embedding model ---------------------------------------------------------
# fastembed text embedder, requested on the CUDA execution provider.
baai_embedding = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    providers=["CUDAExecutionProvider"],
)

[0;93m2025-03-16 18:44:20.101159776 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-03-16 18:44:20.101202116 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [None]:
# Split data: 800 sampled rows for training, 4000 of the remaining rows for testing.
# The index is not reset before the drop, so the two samples are disjoint.
train_data = finance_dataset_interim.sample(n=800)
test_data = finance_dataset_interim.drop(train_data.index).sample(n=4000)

# GPT Classifier
# Credentials and proxy come from the environment (populated by load_dotenv()).
gpt_classifier = GPT4oMini(
    api_key=os.getenv("OPENAI_API_KEY"),
    proxy_url=os.getenv("PROXY_URL"),
    model_name="gpt-4o-mini",
    domain="finance",
    train_data=train_data,
    test_data=test_data,
)

# Best-effort evaluation: a failure is reported and the cell carries on to the
# ModernBERT section instead of aborting.
try:
    gpt_classifier.load_model("models/gpt-4o-mini-finance.json")

    test_predictions, test_actuals, test_latency = gpt_classifier.predict()
    print(f"Test Accuracy: {metrics.accuracy_score(test_actuals, test_predictions)}")

except Exception as e:
    print(f"Error running GPT model: {e}")


try:
    # NOTE: this rebinds `test_data` to a fresh 30,000-row sample for ModernBERT.
    test_data = finance_dataset_interim.sample(n=30_000)
    # ModernBERT Classifier
    bert_classifier = ModernBERTNLI(domain="finance")
    bert_classifier.classifier.model.to("cuda")

    # Test predictions: one prompt at a time so each call is timed individually
    # (nanoseconds, via time.perf_counter_ns).
    test_predictions = []
    test_times = []
    for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
        start_time = time.perf_counter_ns()
        pred = bert_classifier.predict(row["prompt"])
        test_predictions.append(pred)
        test_times.append(time.perf_counter_ns() - start_time)

    test_acc = metrics.accuracy_score(test_data["label"], test_predictions)
    # Report the score, as the GPT branch does; previously it was computed and
    # silently discarded.
    print(f"ModernBERT Test Accuracy: {test_acc}")

except Exception as e:
    print(f"Error running ModernBERT model: {e}")

In [3]:
# 70/30 train/test split of the shuffled dataset.
# The test rows must be selected with the ORIGINAL index labels of the sampled
# rows. Resetting the train index first would turn those labels into 0..N-1, and
# the drop would then remove the first N rows by position instead of the sampled
# ones, leaving train and test overlapping.
train_rows = finance_dataset.sample(frac=0.7)
test_data = finance_dataset.drop(train_rows.index).reset_index(drop=True)
train_data = train_rows.reset_index(drop=True)

In [13]:
# Per-prompt results for fastText: true labels, predicted labels (0/1) and the
# time spent inside each predict call, in nanoseconds.
actuals_ft, predictions_ft, prediction_times_ft = [], [], []

# fastText
# Build the wrapper, then swap in the model loaded from disk.
fasttext_classifier = FastTextClassifier(train_data=train_data, test_data=test_data)
fasttext_classifier.model = fasttext.load_model("models/fastText_finance_fasttext.bin")

for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
    # Newlines are removed from the prompt before it is passed to predict.
    query = str(row["prompt"]).replace("\n", "")

    # Only the predict call sits between the two clock reads.
    start_time = time.perf_counter_ns()
    prediction = fasttext_classifier.model.predict(query)
    elapsed_ns = time.perf_counter_ns() - start_time

    prediction_times_ft.append(elapsed_ns)

    # The top label "__label__1" maps to 1; anything else maps to 0.
    predictions_ft.append(int(prediction[0][0] == "__label__1"))
    actuals_ft.append(row["label"])

100%|██████████| 9000/9000 [00:00<00:00, 18818.39it/s]


In [16]:

# Embedding test data
# Embed every test prompt in one pass and time the whole batch (nanoseconds).
start_time = time.perf_counter_ns()
test_embeds = np.array(list(baai_embedding.embed(test_data["prompt"])))
end_time = time.perf_counter_ns()
embed_times = end_time - start_time

# Only the test prompts were embedded above, so the per-prompt average is taken
# over those rows. (`train_data + test_data` is an element-wise DataFrame
# addition, not a concatenation, so its length is not the number of embedded rows.)
mean_embed_time = embed_times / len(test_embeds)

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load model files that were produced locally or come from a trusted source.
with open("models/SVM_finance_baai.pkl", "rb") as svm_file:
    svm_classifier = pkl.load(svm_file)

xgb_classifier = XGBClassifier()
xgb_classifier.load_model("models/XGBoost_finance_baai.json")

predictions_xgb = []
predictions_svm = []

# Per-sample prediction latencies in nanoseconds.
prediction_times_xgb = []
prediction_times_svm = []

# SVM: predict one embedding at a time. The reshape to a single-row 2-D array
# stays inside the timed region, as in the XGBoost loop below.
for test_embed in test_embeds:
    start_time = time.perf_counter_ns()
    prediction = svm_classifier.predict(test_embed.reshape(1, -1))
    end_time = time.perf_counter_ns()

    prediction_times_svm.append(end_time - start_time)
    predictions_svm.append(prediction[0])


# XGBoost: same per-sample protocol so the two latency lists are comparable.
for test_embed in test_embeds:
    start_time = time.perf_counter_ns()
    prediction = xgb_classifier.predict(test_embed.reshape(1, -1))
    end_time = time.perf_counter_ns()

    prediction_times_xgb.append(end_time - start_time)
    predictions_xgb.append(prediction[0])

In [None]:
# Convert the per-prediction timings from nanoseconds to milliseconds.
NS_PER_MS = 1_000_000
times_ms = {
    'fastText': np.array(prediction_times_ft) / NS_PER_MS,
    'XGBoost': np.array(prediction_times_xgb) / NS_PER_MS,
    'SVM': np.array(prediction_times_svm) / NS_PER_MS,
}

# Long-format frame for plotting: one row per measured prediction.
model_column = []
latency_column = []
for model_name, latencies in times_ms.items():
    model_column.extend([model_name] * len(latencies))
    latency_column.extend(latencies)

plot_data = pd.DataFrame({
    'Model': model_column,
    'Latency (ms)': latency_column,
})

# Boxes are ordered from lowest to highest mean latency.
models_by_mean = sorted(times_ms, key=lambda name: np.mean(times_ms[name]))

# Boxplot on a log-scaled y axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=plot_data, x='Model', y='Latency (ms)', order=models_by_mean, ax=ax)

ax.set_title('Model Latency Comparison')
ax.set_yscale('log')
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Print average latencies
for model, times in times_ms.items():
    print(f"{model} average latency: {times.mean():.2f} ms")