In [None]:
import os

# Run the notebook from the parent of its own directory so the relative
# `data/...` and `models/...` paths used below resolve from there.
# The target is resolved once and stored: a bare `os.chdir("../")` climbs one
# more level every time the cell is re-run, silently breaking those paths.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)
os.chdir(PROJECT_ROOT)
print(os.getcwd())

In [None]:
import random

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tqdm import tqdm
from xgboost import XGBClassifier

from prompt_classifier.metrics import evaluate
from prompt_classifier.modeling.dspy_gpt import GPT4oMini
from prompt_classifier.modeling.fasttext import FastTextClassifier
from prompt_classifier.modeling.nli_modernbert import ModernBERTNLI

# Pull settings from the environment file; later cells read OPENAI_API_KEY and
# PROXY_URL through os.getenv().
load_dotenv()

# Seed both RNGs the notebook can draw from. pandas `.sample()` falls back to
# NumPy's global generator when no `random_state` is given, so seeding only
# the stdlib `random` module leaves every shuffle and split non-reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:

def _shuffled_with_general(domain_prompts, general, seed=42):
    """Concatenate domain prompts with the general prompts and shuffle the rows.

    A fixed ``seed`` is passed as ``random_state`` so the row order (and every
    split derived from it) is identical across kernel restarts. The index is
    reset so that row labels are 0..n-1 after shuffling.
    """
    combined = pd.concat([domain_prompts, general])
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)


# Raw prompt tables; the files are ';'-separated.
law_prompts = pd.read_csv('data/processed/law_prompts.csv', sep=';')
general_prompts = pd.read_csv('data/processed/general_prompts.csv', sep=';')
healthcare_prompts = pd.read_csv('data/processed/healthcare_prompts.csv', sep=';')
finance_prompts = pd.read_csv('data/processed/finance_prompts.csv', sep=';')

# One dataset per domain: that domain's prompts mixed with the general prompts.
law_dataset = _shuffled_with_general(law_prompts, general_prompts)
healthcare_dataset = _shuffled_with_general(healthcare_prompts, general_prompts)
finance_dataset = _shuffled_with_general(finance_prompts, general_prompts)

datasets = {'law': law_dataset, 'healthcare': healthcare_dataset, 'finance': finance_dataset}

# GPT baseline optimization

In [None]:

# Split configuration for the GPT baseline, named so it can be tuned in one place.
GPT_TRAIN_FRAC = 0.0025  # share of each dataset handed to the classifier as train_data
GPT_TEST_SIZE = 1000     # maximum number of held-out rows scored per domain
GPT_SPLIT_SEED = 42      # fixed so the same rows are drawn on every run

for domain, dataset in datasets.items():
    # Seeded sample: without random_state the training rows change on each run,
    # which makes the optimized model and its scores impossible to reproduce.
    train_data = dataset.sample(frac=GPT_TRAIN_FRAC, random_state=GPT_SPLIT_SEED)
    # Everything not drawn for training is eligible for testing; keep the first
    # GPT_TEST_SIZE rows of that remainder.
    test_data = dataset.drop(train_data.index).head(GPT_TEST_SIZE)

    gpt_classifier = GPT4oMini(
        api_key=os.getenv("OPENAI_API_KEY"),
        proxy_url=os.getenv("PROXY_URL"),
        model_name="gpt-4o-mini",
        domain=domain,
        train_data=train_data,
        test_data=test_data,
    )

    gpt_classifier.optimize_model()
    predictions, actuals = gpt_classifier.predict()

    evaluate(predictions, actuals, domain)

    gpt_classifier.save_model(f"models/gpt-4o-mini-{domain}")

# Comparison loop

In [4]:
# Text encoders compared in the loop below: two fastembed models and a TF-IDF
# vectorizer. The TF-IDF entry is fitted later, inside the comparison loop.
baai_embedding = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
tf_idf_embedding = TfidfVectorizer()
mini_embedding = TextEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Registry keyed by the short name that is passed to evaluate() as embed_model.
embedding_models = dict(
    baai=baai_embedding,
    tf_idf=tf_idf_embedding,
    mini=mini_embedding,
)

In [None]:
for domain, dataset in datasets.items():
    # 80/20 train/test split. The seed is fixed so every model in this loop is
    # trained and scored on the same rows across runs.
    train_data = dataset.sample(frac=0.8, random_state=42)
    test_data = dataset.drop(train_data.index)

    train_labels = train_data['label']
    # Ground truth in test-row order; shared by every evaluate() call below.
    actuals = test_data['label'].tolist()

    # Prompts are coerced to str for the embedding models, matching what the
    # fastText path below already does per row.
    train_prompts = train_data['prompt'].astype(str).tolist()
    test_prompts = test_data['prompt'].astype(str).tolist()

    # ModernBERT (disabled)
    # NOTE(review): as written this iterates the full dataset and never appends
    # to `predictions`; both need fixing before it is re-enabled.
    # bert_classifier = ModernBERTNLI(domain=domain)
    # for _, row in tqdm(dataset.iterrows(), total=len(dataset)):
    #    prediction = bert_classifier.predict(row['prompt'])
    #    actuals.append(row['label'])

    #evaluate(predictions, actuals, domain, model_name='ModernBERT', embed_model='modernbert')

    # fastText
    fasttext_classifier = FastTextClassifier(train_data = train_data, test_data = test_data)
    fasttext_classifier.train()

    predictions = []
    for text in tqdm(test_prompts, total=len(test_prompts)):
        # Newlines are stripped before the single-string predict() call.
        query = text.replace('\n', '')
        prediction = fasttext_classifier.model.predict(query)
        # prediction[0][0] is the top predicted label string.
        predictions.append(1 if prediction[0][0] == '__label__1' else 0)

    evaluate(predictions, actuals, domain, model_name='fastText', embed_model='fasttext')

    # SVM and XGBoost with different embeddings
    for model_name, embedding_model in embedding_models.items():

        if model_name == 'tf_idf':
            # Vocabulary is learned on the training prompts only, then applied
            # unchanged to the test prompts.
            train_embeds = embedding_model.fit_transform(train_prompts)
            test_embeds = embedding_model.transform(test_prompts)
        else:
            # embed() is consumed into a list and stacked into a single
            # 2-D (n_samples, n_features) array for the classifiers.
            train_embeds = np.asarray(list(embedding_model.embed(train_prompts)))
            test_embeds = np.asarray(list(embedding_model.embed(test_prompts)))

        # Predict on the whole test matrix at once. The previous per-row loop
        # indexed test_embeds with the DataFrame index label from iterrows();
        # after dropping the training rows those labels are not positions in
        # test_embeds, so it read the wrong row or ran out of bounds. It also
        # passed a single 1-D vector to predict() for the dense embeddings.

        # SVM
        svm_classifier = SVC()
        svm_classifier.fit(train_embeds, train_labels)
        predictions = svm_classifier.predict(test_embeds).tolist()

        evaluate(predictions, actuals, domain, model_name='SVM', embed_model=model_name)

        # XGBoost
        xgboost_classifier = XGBClassifier()
        xgboost_classifier.fit(train_embeds, train_labels)
        predictions = xgboost_classifier.predict(test_embeds).tolist()

        evaluate(predictions, actuals, domain, model_name='XGBoost', embed_model=model_name)