In [1]:
import os
import pickle as pkl
import statistics
import time

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Load variables from a local .env file into the process environment before the
# project modules below are imported (presumably they read settings at import
# time — TODO confirm).
load_dotenv()

import dataloader
import util

# Project helper; presumably seeds the random number generators used in this run
# (verify in util). 22 is also the random_state used for the 80/20 sampling below.
util.set_seed(22)

In [2]:
# Fetch the three data collections from the project `dataloader` module.
# `datasets` and `eval_datasets` are iterated with .items() further down
# (domain name -> frame with "prompt" and "label" columns); `batch_data` is
# sliced into fixed-size chunks by the batch benchmark.
datasets = dataloader.get_domain_data()
eval_datasets = dataloader.get_eval_datasets()
batch_data = dataloader.get_batch_data()

# Batch sizes swept by the batch benchmark at the end of the notebook.
batch_sizes = [1, 32, 64, 128, 256]

README.md:   0%|          | 0.00/410 [00:00<?, ?B/s]

(…)-00000-of-00001-9cc2ae3631bff610.parquet:   0%|          | 0.00/38.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24343 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/163 [00:00<?, ?B/s]

finance_questions_dataset.json:   0%|          | 0.00/53.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/53937 [00:00<?, ? examples/s]

(…)ta-ChatDoctor-HealthCareMagic-100k.jsonl:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19999 [00:00<?, ? examples/s]

Filter:   0%|          | 0/24343 [00:00<?, ? examples/s]

Filter:   0%|          | 0/24343 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53937 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53937 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19999 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19996 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [3]:
# Two fastembed text-embedding models, both requested on the CUDA execution
# provider.
baai_embedding = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5", providers=["CUDAExecutionProvider"]
)
mini_embedding = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    providers=["CUDAExecutionProvider"],
)

# Unfitted TF-IDF vectorizer with default settings; it is fitted in a later cell.
tfidf_embedding = TfidfVectorizer()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

[0;93m2025-05-08 13:14:46.625632107 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-08 13:14:46.625675278 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

[0;93m2025-05-08 13:14:50.421783206 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-08 13:14:50.423256444 [W:onnxruntime:, session_state.cc:1170 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [None]:
# Fit the TF-IDF vocabulary on an 80% sample of the prompts of the first domain
# returned by `datasets`. The same frac/random_state is used for the train split
# in the training cell.
# NOTE(review): only the first domain's prompts define the vocabulary, yet the
# fitted vectorizer is applied to every domain later — confirm this is intended.
first_dataset = next(iter(datasets.values()))["prompt"]
train_prompts = first_dataset.sample(frac=0.8, random_state=22)

tfidf_embedding.fit(train_prompts)

# Create the output directories up front so a fresh checkout does not fail with
# FileNotFoundError when the vectorizer (or, later, the embedding cache) is written.
os.makedirs("models", exist_ok=True)
os.makedirs("cache/embeddings", exist_ok=True)

# Persist the fitted vectorizer; the eval section reloads it from this path.
with open("models/tfidf.pkl", "wb") as f:
    pkl.dump(tfidf_embedding, f)

In [None]:
# Registry of embedding back-ends keyed by the short name that is also embedded
# in cache file names and in the saved SVM model paths.
embedding_models = {
    "mini": mini_embedding,
    "tf_idf": tfidf_embedding,
    "baai": baai_embedding,
}

# Embedding Cache Utility Functions

In [None]:
def get_cached_embeddings(texts, model_name, domain, cache_dir="cache/embeddings", force_recompute=False, batch_size=1):
    """Get embeddings from cache if available, otherwise compute and cache them.

    The cache file is ``{cache_dir}/{domain}_{model_name}_embeddings.pkl``. The
    key does NOT depend on the content of ``texts``: if the texts behind a given
    ``domain``/``model_name`` pair change, pass ``force_recompute=True`` (or
    delete the file), otherwise stale embeddings are returned.

    Relies on two notebook-level globals: ``tfidf_embedding`` (used when
    ``model_name == "tf_idf"``) and ``embedding_models`` (looked up by
    ``model_name`` for every other model).

    Args:
        texts: The texts to embed. Must support ``len()`` and slicing.
        model_name: The name of the embedding model to use.
        domain: The domain identifier for the cache.
        cache_dir: Directory to store/retrieve cached embeddings. Created on
            demand if it does not exist.
        force_recompute: If True, ignore cache and recompute embeddings.
        batch_size: Number of texts handed to the embedding model per call.
            Ignored for ``"tf_idf"``, which transforms all texts at once.
            Defaults to 1, the value that was previously hard-coded.

    Returns:
        The embeddings matrix: the result of ``tfidf_embedding.transform`` for
        ``"tf_idf"``, otherwise a NumPy array built from the model's outputs.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``model_name`` is neither ``"tf_idf"`` nor a key of
            ``embedding_models``.
    """
    if batch_size < 1:
        # A zero step would crash range(); a negative one would silently embed nothing.
        raise ValueError("batch_size must be a positive integer")

    cache_file = f"{cache_dir}/{domain}_{model_name}_embeddings.pkl"

    # Check if cache exists and we're not forcing recomputation
    if os.path.exists(cache_file) and not force_recompute:
        print(f"Loading cached embeddings for {domain} using {model_name}")
        # SECURITY: pickle.load can execute arbitrary code. Only point cache_dir
        # at files this notebook wrote itself.
        with open(cache_file, 'rb') as f:
            return pkl.load(f)

    # Cache doesn't exist or forced recomputation
    if force_recompute:
        print(f"Force recomputing embeddings for {domain} using {model_name}...")
    else:
        print(f"Computing embeddings for {domain} using {model_name}...")

    # Make sure the cache directory exists BEFORE the expensive computation, so a
    # missing directory cannot throw away the freshly computed embeddings.
    os.makedirs(cache_dir, exist_ok=True)

    if model_name == "tf_idf":
        embeddings = tfidf_embedding.transform(texts)
    else:
        # Get the appropriate embedding model
        embed_model = embedding_models[model_name]

        # Feed the model `batch_size` texts at a time and collect the vectors.
        all_embeddings = []
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i+batch_size]
            all_embeddings.extend(embed_model.embed(batch_texts))

        embeddings = np.array(all_embeddings)

    # Cache the results
    with open(cache_file, 'wb') as f:
        pkl.dump(embeddings, f)

    return embeddings

# Train

In [None]:
# Train one SVM per (domain, embedding model) pair on an 80/20 split.
#
# NOTE: the embedding cache is keyed only by "{domain}_train" / "{domain}_test".
# Cache files written by an earlier run with a different split no longer match
# these rows — clear cache/embeddings (or call get_cached_embeddings with
# force_recompute=True) before trusting the results.
for domain, dataset in datasets.items():
    # Sample the training rows first and keep their ORIGINAL index labels so the
    # held-out rows can be selected by exclusion. Resetting the index before the
    # drop would remove labels 0..n_train-1 instead of the sampled rows and leak
    # training rows into the test set.
    # Assumes the dataset index is unique — TODO confirm in dataloader.
    train_sample = dataset.sample(frac=0.8, random_state=22)
    test_data = dataset.drop(train_sample.index).reset_index(drop=True)
    train_data = train_sample.reset_index(drop=True)

    # Total number of prompts embedded per model; used to average the timing.
    n_prompts = len(train_data) + len(test_data)

    for model_name in embedding_models:
        start_time = time.perf_counter_ns()

        # Get cached or compute new embeddings.
        # NOTE: on a cache hit this measures pickle loading, not embedding.
        train_embeds = get_cached_embeddings(train_data["prompt"], model_name, f"{domain}_train")
        test_embeds = get_cached_embeddings(test_data["prompt"], model_name, f"{domain}_test")

        end_time = time.perf_counter_ns()
        embed_times = end_time - start_time
        mean_embed_time = embed_times / n_prompts

        print(f"Embedding time for {model_name}: {mean_embed_time} ns")

        # Train and evaluate SVM model
        util.train_and_evaluate_model(
            model_name="SVM",
            train_embeds=train_embeds,
            test_embeds=test_embeds,
            train_labels=train_data["label"],
            test_labels=test_data["label"],
            domain=domain,
            embed_model=model_name,
            save_path=f"models/SVM_{domain}_{model_name}.pkl",
            embedding_time=mean_embed_time,
            training=True,
        )


# Eval

In [None]:
# Load TF-IDF model
with open("models/tfidf.pkl", "rb") as f:
    tfidf_embedding = pkl.load(f)

embedding_models = {
    "mini": mini_embedding,
    "tf_idf": tfidf_embedding,
    "baai": baai_embedding,
}

In [None]:
# Evaluate the three per-domain SVMs together on every eval dataset, once per
# embedding model.
for embed_model_name in embedding_models:
    # Load SVM models (written by the training cell; pickle files are trusted
    # only because this notebook produced them).
    with open(f"models/SVM_finance_{embed_model_name}.pkl", "rb") as f:
        svm_finance = pkl.load(f)
    with open(f"models/SVM_healthcare_{embed_model_name}.pkl", "rb") as f:
        svm_healthcare = pkl.load(f)
    with open(f"models/SVM_law_{embed_model_name}.pkl", "rb") as f:
        svm_law = pkl.load(f)

    for domain, inference_df in eval_datasets.items():
        # Get actual labels once
        actuals_ml = inference_df["label"].tolist()

        # Use cached embeddings or compute new ones
        test_embeds = get_cached_embeddings(inference_df["prompt"], embed_model_name, f"{domain}_eval")

        # Make batch predictions instead of one-by-one; time all three together.
        start_time = time.perf_counter_ns()
        pred_finance = svm_finance.predict(test_embeds)
        pred_healthcare = svm_healthcare.predict(test_embeds)
        pred_law = svm_law.predict(test_embeds)
        end_time = time.perf_counter_ns()

        # Per-sample latency in ns: total prediction time spread evenly over all
        # samples. This equals the mean of a list holding that value once per
        # sample, so no list needs to be built.
        mean_latency = (end_time - start_time) / test_embeds.shape[0]

        # Combine predictions: 0 when any of the three SVMs outputs 1, else 1.
        # NOTE(review): the meaning of 0/1 in the eval labels is defined by the
        # dataloader — confirm this mapping matches it.
        predictions_svm = [
            0 if (f == 1 or h == 1 or l == 1) else 1
            for f, h, l in zip(pred_finance, pred_healthcare, pred_law, strict=True)
        ]

        # Evaluate results
        util.evaluate_run(
            predictions=predictions_svm,
            true_labels=actuals_ml,
            latency=mean_latency,
            domain=domain,
            embed_model=embed_model_name,
            model_name="SVM",
            train_acc=0.0,
            cost=0.0,
            training=False,
        )

# Batch

In [None]:
# Batch benchmark: for every embedding model and batch size, time the embedding
# step and each of the three SVM predictions per batch, then write one CSV per
# embedding model. Times are in seconds (time.perf_counter).

# Create the output directory up front so the CSV write cannot fail with a
# missing-directory error after the whole benchmark has already run.
os.makedirs("data/results", exist_ok=True)

for embedding_model_name in ["mini", "baai", "tf_idf"]:
    svm_batch_results = []
    # Load SVM models
    with open(f"models/SVM_finance_{embedding_model_name}.pkl", "rb") as f:
        svm_finance = pkl.load(f)
    with open(f"models/SVM_healthcare_{embedding_model_name}.pkl", "rb") as f:
        svm_healthcare = pkl.load(f)
    with open(f"models/SVM_law_{embedding_model_name}.pkl", "rb") as f:
        svm_law = pkl.load(f)

    # Loop-invariant lookup, kept outside the timed region.
    embedding_model = embedding_models[embedding_model_name]

    for batch_size in batch_sizes:
        print(f"Processing batch size {batch_size} with {embedding_model_name} embeddings")
        batches = [
            batch_data[i : i + batch_size]
            for i in range(0, len(batch_data), batch_size)
        ]
        for batch in tqdm(batches):
            # Time embeddings
            start_time = time.perf_counter()
            if embedding_model_name == "tf_idf":
                embeds = embedding_model.transform(batch)
            else:
                embeds = np.array(list(embedding_model.embed(batch)))
            embed_time = time.perf_counter() - start_time

            # Get all predictions and time each classifier separately
            start_time = time.perf_counter()
            svm_law_preds = svm_law.predict(embeds)
            svm_law_time = time.perf_counter() - start_time

            start_time = time.perf_counter()
            svm_finance_preds = svm_finance.predict(embeds)
            svm_finance_time = time.perf_counter() - start_time

            start_time = time.perf_counter()
            svm_health_preds = svm_healthcare.predict(embeds)
            svm_health_time = time.perf_counter() - start_time

            # One dictionary per prompt in the batch; strict=True guards against
            # the classifiers returning different numbers of predictions.
            results = [
                {
                    'finance': int(finance_pred),
                    'healthcare': int(health_pred),
                    'law': int(law_pred)
                }
                for law_pred, finance_pred, health_pred in zip(
                    svm_law_preds, svm_finance_preds, svm_health_preds, strict=True
                )
            ]

            # Record results for this batch
            svm_batch_results.append(
                {
                    "batch_size": batch_size,
                    "time_taken_embed": embed_time,
                    "time_taken_law": svm_law_time,
                    "time_taken_finance": svm_finance_time,
                    "time_taken_healthcare": svm_health_time,
                    "results": results,
                    "model_name": "svm",
                    "embedding_model": embedding_model_name,
                    "embedding": True,
                }
            )

    pd.DataFrame(svm_batch_results).to_csv(
        f"data/results/batch_svm_{embedding_model_name}.csv", index=False
    )


In [None]:
# Sanity-check the shape of the batch results left over from the last
# embedding model processed by the benchmark loop.
if svm_batch_results:
    first_batch = svm_batch_results[0]
    first_results = first_batch['results']

    print(f"Number of batch results: {len(svm_batch_results)}")
    print(f"First batch size: {first_batch['batch_size']}")
    print(f"Number of results in first batch: {len(first_results)}")
    print("\nSample results from first batch:")
    # Show at most the first three per-prompt result dictionaries.
    for position, entry in enumerate(first_results[:3], start=1):
        print(f"  Result {position}: {entry}")