In [1]:
import json
import os
import pickle as pkl
import random
import statistics
import time

import joblib
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn import metrics
from sklearn.svm import SVC
from tqdm import tqdm
from xgboost import XGBClassifier

from prompt_classifier.modeling.dspy_gpt import GPT4oMini
from prompt_classifier.modeling.fasttext import FastTextClassifier
from prompt_classifier.modeling.nli_modernbert import ModernBERTNLI

# Load environment variables from a local .env file; later cells read
# OPENAI_API_KEY and PROXY_URL through os.getenv.
load_dotenv()

# Seed every RNG the notebook relies on. pandas' DataFrame.sample draws from
# NumPy's global generator when no random_state is passed, so seeding only the
# stdlib `random` module leaves the shuffles and splits non-reproducible.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'prompt_classifier'

In [None]:
# Processed prompt sets: finance prompts plus general prompts.
general_prompts = pd.read_csv("data/processed/general_prompts.csv")
finance_prompts = pd.read_csv("data/processed/finance_prompts.csv")

# Concatenate both sources and shuffle the rows. random_state pins the
# shuffle so a Restart & Run All yields the same row order (the value mirrors
# random.seed(1) in the setup cell); reset_index gives a unique 0..n-1 index.
finance_dataset = (
    pd.concat([finance_prompts, general_prompts])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Interim prompt sets, combined and shuffled the same way.
general_prompts_interim = pd.read_csv("data/interim/general_prompts.csv")
finance_prompts_interim = pd.read_csv("data/interim/finance_prompts.csv")

finance_dataset_interim = (
    pd.concat([finance_prompts_interim, general_prompts_interim])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Sentence-embedding model used later for the SVM / XGBoost classifiers.
# Requests the CUDA execution provider, so a GPU-enabled runtime is expected.
baai_embedding = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    providers=["CUDAExecutionProvider"]
)

In [None]:
# Split data: 800 rows passed to the GPT classifier as train_data and 4,000
# disjoint rows as test_data. random_state pins both draws so reruns evaluate
# the same rows (the value mirrors random.seed(1) in the setup cell).
train_data = finance_dataset_interim.sample(n=800, random_state=1)
test_data = finance_dataset_interim.drop(train_data.index).sample(
    n=4000, random_state=1
)

# GPT Classifier
gpt_classifier = GPT4oMini(
    api_key=os.getenv("OPENAI_API_KEY"),
    proxy_url=os.getenv("PROXY_URL"),
    model_name="gpt-4o-mini",
    domain="finance",
    train_data=train_data,
    test_data=test_data,
)

# Best-effort evaluation: a failure (missing model file, API error, ...) is
# reported and the notebook continues with the next model.
try:
    gpt_classifier.load_model("models/gpt-4o-mini-finance.json")

    # predict() yields predictions, ground-truth labels and a latency value.
    test_predictions, test_actuals, test_latency = gpt_classifier.predict()
    print(f"Test Accuracy: {metrics.accuracy_score(test_actuals, test_predictions)}")

except Exception as e:
    print(f"Error running GPT model: {e}")


try:
    # ModernBERT is scored on its own 30,000-row sample; this rebinds
    # `test_data`, replacing the GPT evaluation split above.
    test_data = finance_dataset_interim.sample(n=30_000, random_state=1)
    # ModernBERT Classifier
    bert_classifier = ModernBERTNLI(domain="finance")
    bert_classifier.classifier.model.to("cuda")

    # Test predictions, timed one prompt at a time.
    test_predictions = []
    test_times = []  # per-prompt latency in nanoseconds
    # Iterate over the prompt column directly; only that field is needed and
    # it avoids building a Series per row as iterrows() does.
    for prompt in tqdm(test_data["prompt"], total=len(test_data)):
        start_time = time.perf_counter_ns()
        pred = bert_classifier.predict(prompt)
        test_predictions.append(pred)
        test_times.append(time.perf_counter_ns() - start_time)

    test_acc = metrics.accuracy_score(test_data["label"], test_predictions)
    # Report the score, as the GPT branch does; it was previously computed
    # but never shown.
    print(f"ModernBERT Test Accuracy: {test_acc}")

except Exception as e:
    print(f"Error running ModernBERT model: {e}")

In [None]:
# 70/30 train/test split of the processed dataset.
# The held-out rows must be selected through the sample's ORIGINAL index
# labels. Resetting the index first would make drop() remove rows 0..n-1
# instead of the sampled rows, leaking training rows into the test set.
train_sample = finance_dataset.sample(frac=0.7, random_state=1)
test_data = finance_dataset.drop(train_sample.index).reset_index(drop=True)
train_data = train_sample.reset_index(drop=True)

actuals = []
predictions = []
prediction_times = []  # per-prompt latency in nanoseconds

# fastText
# Best-effort evaluation: any failure is reported and the notebook continues.
try:
    fasttext_classifier = FastTextClassifier(train_data=train_data, test_data=test_data)
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only load model artifacts from a trusted source.
    fasttext_classifier.model = joblib.load("models/fastText_finance_fasttext.bin")

    for prompt, label in tqdm(
        zip(test_data["prompt"], test_data["label"]), total=len(test_data)
    ):
        # Newlines are stripped from the prompt before it is passed to predict().
        query = str(prompt).replace("\n", "")

        # Time only the model call, not the preprocessing above.
        start_time = time.perf_counter_ns()
        prediction = fasttext_classifier.model.predict(query)
        end_time = time.perf_counter_ns()

        prediction_times.append(end_time - start_time)

        # The top-ranked label "__label__1" is mapped to class 1, anything else to 0.
        predictions.append(1 if prediction[0][0] == "__label__1" else 0)

        actuals.append(label)

except Exception as e:
    print(f"Error running fastText model: {e}")

In [None]:

# Embedding test data
start_time = time.perf_counter_ns()
test_embeds = np.array(list(baai_embedding.embed(test_data["prompt"])))
end_time = time.perf_counter_ns()
embed_times = end_time - start_time  # total embedding time in nanoseconds

# Only the test prompts were embedded, so average over those rows.
# (`train_data + test_data` adds the two frames element-wise, aligned on the
# index; its length is not a combined row count.)
mean_embed_time = embed_times / len(test_data)

# NOTE(review): pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open("models/SVM_finance_baai.pkl", "rb") as svm_file:
    svm_classifier = pkl.load(svm_file)

with open("models/XGBoost_finance_baai.pkl", "rb") as xgboost_file:
    xgboost_classifier = pkl.load(xgboost_file)


def _timed_predictions(classifier, embeddings):
    """Predict one embedding at a time and time each call.

    Args:
        classifier: Object exposing ``predict`` on a 2-D array.
        embeddings: Iterable of 1-D embedding vectors.

    Returns:
        Tuple ``(labels, times_ns)``: the first element of each prediction
        and the matching ``predict`` latency in nanoseconds.
    """
    labels = []
    times_ns = []
    for embedding in embeddings:
        # Each vector is reshaped to a single-row batch before predict().
        sample = embedding.reshape(1, -1)
        started = time.perf_counter_ns()
        result = classifier.predict(sample)
        times_ns.append(time.perf_counter_ns() - started)
        labels.append(result[0])
    return labels, times_ns


# Keep each model's results separate so they can be scored individually
# against the test labels.
svm_predictions, svm_prediction_times = _timed_predictions(
    svm_classifier, test_embeds
)
xgboost_predictions, xgboost_prediction_times = _timed_predictions(
    xgboost_classifier, test_embeds
)

# Legacy combined lists (SVM results followed by XGBoost results), kept under
# their original names for any later cell that still reads them.
predictions = svm_predictions + xgboost_predictions
prediction_times = svm_prediction_times + xgboost_prediction_times