In [None]:
import os

# Move the working directory up one level so the relative paths used below
# (e.g. 'data/...', 'models/...') resolve from the parent directory.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves up one more level each time.
os.chdir("../")
print(os.getcwd())

In [None]:
import time
import statistics
import pickle
import random  
import wandb

import numpy as np
import onnxruntime as ort
import pandas as pd
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn import metrics

from prompt_classifier.metrics import evaluate
from prompt_classifier.modeling.dspy_gpt import GPT4oMini
from prompt_classifier.modeling.fasttext import FastTextClassifier
from prompt_classifier.modeling.nli_modernbert import ModernBERTNLI

# Load environment variables (OPENAI_API_KEY and PROXY_URL are read later via os.getenv).
load_dotenv()

# Seed both RNGs. Python's `random` alone is not enough: DataFrame.sample()
# without an explicit random_state draws from NumPy's global RNG, so the
# shuffles and train/test splits below would otherwise differ on every run.
random.seed(42)
np.random.seed(42)

In [None]:
# Ask ONNX Runtime which execution providers it reports as available in this
# environment, and print them for inspection.
providers = ort.get_available_providers()

print(providers)

In [4]:
# Load the processed prompt sets: one per domain plus the shared general set.
law_prompts = pd.read_csv('data/processed/law_prompts.csv')
general_prompts = pd.read_csv('data/processed/general_prompts.csv')
healthcare_prompts = pd.read_csv('data/processed/healthcare_prompts.csv')
finance_prompts = pd.read_csv('data/processed/finance_prompts.csv')

# Each domain dataset is that domain's prompts plus the general prompts, shuffled.
# random_state pins the shuffle: DataFrame.sample() does not use Python's `random`
# module, so random.seed(42) alone does not make it reproducible.
law_dataset = pd.concat([law_prompts, general_prompts]).sample(frac=1, random_state=42).reset_index(drop=True)
healthcare_dataset = pd.concat([healthcare_prompts, general_prompts]).sample(frac=1, random_state=42).reset_index(drop=True)
finance_dataset = pd.concat([finance_prompts, general_prompts]).sample(frac=1, random_state=42).reset_index(drop=True)

# Keyed by domain name; iterated by the training/evaluation loops.
datasets = {'law': law_dataset, 'healthcare': healthcare_dataset, 'finance': finance_dataset}

In [5]:
# Load the interim prompt sets: one per domain plus the shared general set.
law_prompts_interim = pd.read_csv('data/interim/law_prompts.csv')
general_prompts_interim = pd.read_csv('data/interim/general_prompts.csv')
healthcare_prompts_interim = pd.read_csv('data/interim/healthcare_prompts.csv')
finance_prompts_interim = pd.read_csv('data/interim/finance_prompts.csv')

# Each domain dataset is that domain's prompts plus the general prompts, shuffled.
# random_state pins the shuffle: DataFrame.sample() does not use Python's `random`
# module, so random.seed(42) alone does not make it reproducible.
law_dataset_interim = pd.concat([law_prompts_interim, general_prompts_interim]).sample(frac=1, random_state=42).reset_index(drop=True)
healthcare_dataset_interim = pd.concat([healthcare_prompts_interim, general_prompts_interim]).sample(frac=1, random_state=42).reset_index(drop=True)
finance_dataset_interim = pd.concat([finance_prompts_interim, general_prompts_interim]).sample(frac=1, random_state=42).reset_index(drop=True)

# Keyed by domain name; iterated by the GPT / ModernBERT loop.
datasets_interim = {'law': law_dataset_interim, 'healthcare': healthcare_dataset_interim, 'finance': finance_dataset_interim}

In [6]:
# Dense sentence-embedding models served through fastembed, requested on the CUDA provider.
baai_embedding = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5", providers=["CUDAExecutionProvider"]
)
mini_embedding = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2", providers=["CUDAExecutionProvider"]
)

# Sparse bag-of-words baseline; it is fitted per domain inside the training loop.
tfidf = TfidfVectorizer()

# The key must be 'tf_idf': the training loop picks the fit/transform branch with
# `model_name == 'tf_idf'`. With any other key the vectorizer would fall into the
# `.embed(...)` branch, which TfidfVectorizer does not provide.
embedding_models = {'mini': mini_embedding, 'tf_idf': tfidf, 'baai': baai_embedding}

In [None]:
# Report the providers each embedding model's underlying session exposes,
# one model per output line.
print(
    f'BAAI-BGE available providers: {baai_embedding.model.model.get_providers()}',
    f'MiniLM available providers: {mini_embedding.model.model.get_providers()}',
    sep='\n',
)

# GPT and ModernBERT loop using interim data

In [None]:

for domain, dataset in datasets_interim.items():
    # Tiny training sample plus up to 100 held-out rows for the GPT classifier.
    # The index is not reset before drop(), so the sampled rows are the ones removed.
    train_data = dataset.sample(frac=0.00025)
    test_data = dataset.drop(train_data.index).head(100)

    gpt_classifier = GPT4oMini(api_key=os.getenv("OPENAI_API_KEY"), proxy_url=os.getenv("PROXY_URL"), model_name="gpt-4o-mini",
                            domain=domain, train_data=train_data, test_data=test_data)

    # The GPT optimisation / prediction / evaluation steps are currently disabled.
    #gpt_classifier.optimize_model()

    #predictions, actuals, mean_latency = gpt_classifier.predict()

    #evaluate(predictions=predictions, true_labels=actuals, domain=domain, model_name="gpt4o-mini", embed_model="ada-002", cost=gpt_classifier.cost, latency=mean_latency)

    #gpt_classifier.save_model(f"models/gpt-4o-mini-{domain}.json")

    actuals = []
    predictions = []
    prediction_times = []

    # ModernBERT: scored over the whole domain dataset, timing each predict() call.
    bert_classifier = ModernBERTNLI(domain=domain)
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):

        start_time = time.perf_counter_ns()
        prediction = bert_classifier.predict(row['prompt'])
        end_time = time.perf_counter_ns()

        # Record the prediction next to its label; previously it was dropped,
        # so evaluate() received an empty prediction list.
        # NOTE(review): assumes predict() returns a value directly comparable to row['label'] - confirm.
        predictions.append(prediction)
        actuals.append(row['label'])
        prediction_times.append(end_time - start_time)

    # Mean latency in nanoseconds, computed once after the loop instead of on every row.
    mean_prediction_time = statistics.mean(prediction_times)

    evaluate(predictions, actuals, domain, model_name='ModernBERT', embed_model='modernbert', latency=mean_prediction_time)


# SVM, fastText and XGBoost loop using processed data

In [8]:
def train_and_evaluate_model(
    model_name: str,
    train_embeds: np.ndarray,
    test_embeds: np.ndarray,
    train_labels: pd.Series,
    test_labels: pd.Series,
    domain: str,
    embed_model: str,
    save_path: str,
    model_params: dict = None
) -> None:
    """Train an SVM or XGBoost classifier on embeddings, evaluate it, and save it.

    The classifier is fitted on the training embeddings, scored on the training
    set (accuracy and log loss), then timed one test row at a time. Results go
    to `evaluate`, the two losses are logged to the W&B run, and the fitted
    model is written to `save_path`.

    Args:
        model_name: 'SVM' or 'XGBoost'.
        train_embeds: Feature matrix used for fitting.
        test_embeds: Feature matrix used for evaluation; iterated row by row.
        train_labels: Labels aligned with `train_embeds`.
        test_labels: Labels aligned with `test_embeds`.
        domain: Domain name, used in the run name, log messages and `evaluate`.
        embed_model: Embedding name, used in the run name, log messages and `evaluate`.
        save_path: Destination file for the fitted model. XGBoost models are
            written with `save_model`; SVM models are pickled. A falsy value
            skips saving.
        model_params: Optional dict stored as the W&B run config. Only its
            top-level keys 'C'/'kernel' (SVM) or 'max_depth'/'learning_rate'/
            'n_estimators' (XGBoost) are forwarded to the classifier.

    Raises:
        ValueError: If `model_name` is neither 'SVM' nor 'XGBoost'.
    """
    params = model_params or {}

    # Build the classifier first, so an invalid model_name fails before a W&B run is opened.
    if model_name == 'SVM':
        valid_params = {key: value for key, value in params.items() if key in ('C', 'kernel')}
        # probability=True is required for predict_proba(), which the log-loss calls need.
        classifier = SVC(**valid_params, probability=True)
    elif model_name == 'XGBoost':
        valid_params = {
            key: value for key, value in params.items()
            if key in ('max_depth', 'learning_rate', 'n_estimators')
        }
        classifier = XGBClassifier(n_jobs=-1, **valid_params)
    else:
        raise ValueError("Invalid model_name. Choose 'SVM' or 'XGBoost'.")

    # Initialize W&B run
    wandb.init(project="bc-prompt-classification", name=f"{model_name}_{embed_model}_{domain}", config=model_params)

    try:
        print(f"Training {embed_model} embeddings on {domain} domain using {model_name}")

        # Train the model
        classifier.fit(train_embeds, train_labels)

        # Calculate training accuracy and loss
        train_predictions = classifier.predict(train_embeds)
        train_acc = metrics.accuracy_score(train_labels, train_predictions)
        train_loss = metrics.log_loss(train_labels, classifier.predict_proba(train_embeds))
        print(f"Training Loss for {model_name} on {domain} domain: {train_loss}")

        predictions = []
        prediction_times = []

        # Predict one row at a time so the recorded latency is per sample.
        for test_embed in tqdm(test_embeds, desc=f"Evaluating {model_name} on {domain}"):
            start_time = time.perf_counter_ns()
            prediction = classifier.predict(test_embed.reshape(1, -1))
            end_time = time.perf_counter_ns()

            prediction_times.append(end_time - start_time)
            predictions.append(prediction[0])

        # Mean per-sample latency in nanoseconds.
        mean_prediction_time = statistics.mean(prediction_times)

        # Calculate test loss
        test_loss = metrics.log_loss(test_labels, classifier.predict_proba(test_embeds))
        print(f"Test Loss for {model_name} on {domain} domain: {test_loss}")

        # The losses are not passed to evaluate(), so record them on the run here.
        wandb.log({"train_loss": train_loss, "test_loss": test_loss})

        # Evaluate the predictions
        evaluate(
            predictions,
            test_labels,
            domain,
            model_name=model_name,
            embed_model=embed_model,
            latency=mean_prediction_time,
            train_acc=train_acc
        )

        # Persist the fitted model; save_path used to be accepted but ignored.
        if save_path:
            os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
            if model_name == 'XGBoost':
                classifier.save_model(save_path)
            else:
                with open(save_path, 'wb') as model_file:
                    pickle.dump(classifier, model_file)
    finally:
        # Close the run even on failure so repeated calls do not leave runs open.
        wandb.finish()

# `tfidf` is the TfidfVectorizer instance created alongside the embedding models;
# the previous name `tf_idf_embedding` is not defined anywhere and raised NameError.
embedding_models = {'mini': mini_embedding, 'tf_idf': tfidf, 'baai': baai_embedding}

In [None]:
# Re-check which providers each embedding model's underlying session exposes
# before the training loop runs; one model per output line.
print(
    f'BAAI-BGE available providers: {baai_embedding.model.model.get_providers()}',
    f'MiniLM available providers: {mini_embedding.model.model.get_providers()}',
    sep='\n',
)

In [None]:
for domain, dataset in datasets.items():
    # 70/30 train/test split. The indices are reset only AFTER the drop: resetting
    # train_data first turns its index into 0..n-1, so drop() would remove the first
    # n rows of `dataset` instead of the sampled rows and leak training rows into
    # the test set.
    train_data = dataset.sample(frac=0.7)
    test_data = dataset.drop(train_data.index).reset_index(drop=True)
    train_data = train_data.reset_index(drop=True)

    actuals = []
    predictions = []
    prediction_times = []

    # fastText
    fasttext_classifier = FastTextClassifier(train_data=train_data, test_data=test_data)
    fasttext_classifier.train()

    # Training accuracy: newlines are stripped from each prompt before predict(),
    # and the top predicted label is mapped to 1 for '__label__1', else 0.
    train_predictions = []
    for _, row in train_data.iterrows():
        query = str(row['prompt']).replace('\n', '')
        prediction = fasttext_classifier.model.predict(query)
        train_predictions.append(1 if prediction[0][0] == '__label__1' else 0)
    train_acc_fasttext = metrics.accuracy_score(train_data['label'], train_predictions)

    # Test pass: time each predict() call individually (nanoseconds).
    for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
        query = str(row['prompt']).replace('\n', '')

        start_time = time.perf_counter_ns()
        prediction = fasttext_classifier.model.predict(query)
        end_time = time.perf_counter_ns()

        prediction_times.append(end_time - start_time)
        predictions.append(1 if prediction[0][0] == '__label__1' else 0)
        actuals.append(row['label'])

    mean_prediction_time = statistics.mean(prediction_times)
    evaluate(predictions, actuals, domain, model_name='fastText', embed_model='fasttext',
            latency=mean_prediction_time, train_acc=train_acc_fasttext)

    fasttext_classifier.model.save_model(f"models/fastText_{domain}_fasttext.bin")

    # Embedding-based models: build features once per embedding, then train SVM and XGBoost on them.
    for model_name, embedding_model in embedding_models.items():

        if model_name == 'tf_idf':
            # Fit the vectorizer on the training prompts only, then save it for reuse.
            embedding_model.fit(train_data['prompt'])
            with open(f"models/tfidf_{domain}.pkl", 'wb') as f:
                pickle.dump(embedding_model, f)
            train_embeds = embedding_model.transform(train_data['prompt'])
            test_embeds = embedding_model.transform(test_data['prompt'])

            sweep_config = {
                "method": "bayes",
                "metric": {"name": "train_loss", "goal": "minimize"},
                "parameters": {
                    "C": {"min": 0.1, "max": 10, "distribution": "log_uniform"},
                    "kernel": {"values": ["linear", "rbf", "poly"]}
                }
            }

        else:
            # embed() is consumed via list(); stack its items into one array.
            train_embeds = np.array(list(embedding_model.embed(train_data['prompt'])))
            test_embeds = np.array(list(embedding_model.embed(test_data['prompt'])))

            sweep_config = {
                "method": "bayes",
                "metric": {"name": "train_loss", "goal": "minimize"},
                "parameters": {
                    "max_depth": {"min": 3, "max": 10},
                    "learning_rate": {"min": 0.01, "max": 0.3},
                    "n_estimators": {"min": 50, "max": 500}
                }
            }

        # NOTE(review): the same sweep_config goes to both models, and
        # train_and_evaluate_model only reads top-level hyperparameter keys, so the
        # nested "parameters" ranges are recorded as run config but not applied - confirm intent.

        # Train and evaluate SVM model
        train_and_evaluate_model(
            model_name='SVM',
            train_embeds=train_embeds,
            test_embeds=test_embeds,
            train_labels=train_data['label'],
            test_labels=test_data['label'],
            domain=domain,
            embed_model=model_name,
            save_path=f'models/SVM_{domain}_{model_name}.pkl',
            model_params=sweep_config,
        )

        # Train and evaluate XGBoost model
        train_and_evaluate_model(
            model_name='XGBoost',
            train_embeds=train_embeds,
            test_embeds=test_embeds,
            train_labels=train_data['label'],
            test_labels=test_data['label'],
            domain=domain,
            embed_model=model_name,
            save_path=f'models/XGBoost_{domain}_{model_name}.json',
            model_params=sweep_config,
        )