In [2]:
import torch
import warnings
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
warnings.filterwarnings('ignore')
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import os
import time
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Multi-label column names mapped to integer indices.
label_map = {'toxic':0, 'severe_toxic':1, 'obscene':2, 'threat':3, 'insult':4, 'identity_hate':5}
# Use a forward slash: "\s" is an invalid string escape (a SyntaxWarning on
# recent Python versions) and a backslash separator only resolves on Windows.
# Forward slashes work on every platform and match the other paths in this notebook.
output_dir = "simCSE_models/simcse_binary"
# Load the fine-tuned 2-class checkpoint and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(output_dir, num_labels=2, hidden_dropout_prob=0.1).to(device)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model.resize_token_embeddings(len(tokenizer)) # need to resize due to new tokens added

Embedding(30522, 768, padding_idx=0)

In [5]:
# Load the test subset directly from its Arrow file.
test_dataset = Dataset.from_file(r"subset_test/data-00000-of-00001.arrow")
label_map = {'toxic':0, 'severe_toxic':1, 'obscene':2, 'threat':3, 'insult':4, 'identity_hate':5}
# Remove unnecessary columns: the per-category label columns named in label_map
# are dropped, leaving the columns shown by the print below.
test_dataset = test_dataset.remove_columns(list(label_map.keys()))
print(test_dataset[0])
def preprocess_for_onnx(ex):
    """Tokenize a single dataset row into fixed-length NumPy features.

    Uses the notebook-level ``tokenizer``. The text is truncated/padded to
    exactly 128 tokens.

    Args:
        ex: One dataset row providing ``"comment_text"`` and ``"cyberbullying"``.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` (the first row of the
        tokenizer's NumPy output) and ``"labels"`` (the row's
        ``"cyberbullying"`` value, passed through unchanged).
    """
    encoded = tokenizer(
        ex["comment_text"],
        return_tensors="np",
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # The tokenizer output is indexed with [0] to take the first (only) row,
    # so each feature is stored without the leading batch axis.
    features = {key: encoded[key][0] for key in ("input_ids", "attention_mask")}
    features["labels"] = ex["cyberbullying"]
    return features
# Run the tokenization over every row of the test set.
onnx_test_dataset = test_dataset.map(preprocess_for_onnx)
# Sanity check: display the label of the first mapped example.
onnx_test_dataset[0]['labels']

{'id': 'fd74868bc7a076e1', 'comment_text': "What the hell, what would we call her if we wanted to say she was black? I'm talking about her race.", 'cyberbullying': 1}


Map: 100%|██████████| 500/500 [00:00<00:00, 2483.53 examples/s]


1

### Quantize Section

In [6]:
# Example input used to trace the model; it is padded to the same 128-token
# length used at inference time.
dummy = tokenizer("This is a test", return_tensors="pt", padding="max_length", truncation=True, max_length=128)
# Keep the returned object explicitly instead of relying on in-place mutation.
dummy = dummy.to(device)
onnx_model_name = "simCSE_models/simcse_binary_model.onnx"
# Pin inference mode here so the exported graph never depends on which cells
# ran before this one (dropout must be inactive in the exported model).
model.eval()
# Gradients are not needed for export.
with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy["input_ids"], dummy["attention_mask"]),
        onnx_model_name,
        input_names=["input_ids", "attention_mask"],
        output_names=["logits"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "seq_len"},
            "attention_mask": {0: "batch_size", 1: "seq_len"},
            # Only the batch axis varies; the class axis is fixed by the
            # classification head, so it is left static.
            "logits": {0: "batch_size"},
        },
        opset_version=14
    )

In [7]:
# Reload the exported file from disk and run the ONNX checker on it.
# check_model raises if the model is invalid, so reaching the print means it passed.
onnx_model = onnx.load(onnx_model_name)
onnx.checker.check_model(onnx_model)
print("ONNX export successful and model is valid!")

ONNX export successful and model is valid!


In [8]:
# Dynamically quantize the exported float model and write the result next to it.
model_input=onnx_model_name
model_output="simCSE_models/simcse_binary_model_quantized.onnx"
quantize_dynamic(
    model_input=model_input,
    model_output=model_output,
    # Weights are stored as unsigned 8-bit integers.
    weight_type=QuantType.QUInt8,
)



In [9]:
def print_size_of_model(model,name):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is saved into a private temporary directory, so an existing
    ``temp.p`` in the working directory is never overwritten or deleted, and
    the scratch file is cleaned up even if saving or measuring fails.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        name: Label printed in front of the size.

    Returns:
        None. Prints ``<name>  Size (MB): <bytes / 1e6>``.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # The file name is kept as "temp.p" so the serialized archive stays
        # byte-for-byte comparable with what the previous implementation wrote.
        scratch_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), scratch_path)
        size_bytes = os.path.getsize(scratch_path)
    print(name,' Size (MB):', size_bytes/1e6)
def print_size_of_onnx_model(model_path, name="ONNX Model"):
    """Print the on-disk size of the file at ``model_path`` in megabytes.

    Args:
        model_path: Path of the file to measure.
        name: Label printed in front of the size.

    Returns:
        None. Prints ``<name> Size (MB): <size>`` with two decimals, where one
        megabyte is 1e6 bytes.
    """
    byte_count = os.stat(model_path).st_size
    size_mb = byte_count / 1e6
    print(f"{name} Size (MB): {size_mb:.2f}")
# Compare the serialized PyTorch weights with the ONNX file on disk.
# Note: model_output is the quantized ONNX file, so the "ONNX Model" line
# reports the quantized model's size, not the float export's.
print_size_of_model(model, "PyTorch Model")
print_size_of_onnx_model(model_output, "ONNX Model")

PyTorch Model  Size (MB): 438.000505
ONNX Model Size (MB): 110.38


### Inference Section

In [14]:
def timed_onnx_inference(session, tokenizer, dataset, batch_size=32, f1_average="micro"):
    """Run batched ONNX Runtime inference over ``dataset`` and time it.

    The timed region covers batching, tokenization and ``session.run``; metric
    computation happens after the clock is stopped.

    Args:
        session: ONNX Runtime session exposing ``run`` with inputs
            ``"input_ids"``/``"attention_mask"`` and an output named ``"logits"``.
        tokenizer: Callable tokenizer; called with a list of texts and
            ``return_tensors="np"``.
        dataset: Sliceable dataset whose slices are dicts of columns containing
            ``"comment_text"`` and ``"cyberbullying"``.
        batch_size: Number of rows per inference batch.
        f1_average: ``average`` argument forwarded to ``f1_score``. The default
            ``"micro"`` keeps the previous behavior; note that for single-label
            data micro-averaged F1 is numerically equal to accuracy, so pass
            ``"binary"`` to get the positive-class F1 instead.

    Returns:
        Tuple ``(f1, elapsed_seconds, accuracy)``.
    """
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    started = time.perf_counter()
    all_preds = []
    all_labels = []

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]  # This is a dict of batched columns

        # Tokenize batch["comment_text"]
        tokens = tokenizer(
            batch["comment_text"],
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=128
        )

        inputs = {
            "input_ids": tokens["input_ids"].astype(np.int64),
            "attention_mask": tokens["attention_mask"].astype(np.int64)
        }

        # ONNX inference
        logits = session.run(["logits"], inputs)[0]
        # Take the argmax on the raw logits. Squashing them through a sigmoid
        # first cannot change the winner, but it can saturate in float32 and
        # turn a clear decision into a tie (or emit overflow warnings).
        all_preds.extend(np.argmax(logits, axis=-1).tolist())
        all_labels.extend(batch["cyberbullying"])

    elapsed = time.perf_counter() - started
    f1 = f1_score(all_labels, all_preds, average=f1_average)
    acc = accuracy_score(all_labels, all_preds)
    return f1, elapsed, acc

def timed_inference(model, tokenizer, dataset=None, f1_average="micro"):
    """Run the PyTorch model through a text-classification pipeline and time it.

    Only the pipeline call itself is timed. The pipeline is built before the
    clock starts so the measurement is comparable with the ONNX benchmark,
    whose session is also created outside its timed region.

    Args:
        model: Sequence-classification model handed to ``pipeline``.
        tokenizer: Tokenizer handed to ``pipeline``.
        dataset: Dataset with ``"comment_text"`` and ``"cyberbullying"``
            columns. Defaults to the notebook-level ``test_dataset``.
        f1_average: ``average`` argument forwarded to ``f1_score``. The default
            ``"micro"`` keeps the previous behavior; note that for single-label
            data micro-averaged F1 is numerically equal to accuracy, so pass
            ``"binary"`` to get the positive-class F1 instead.

    Returns:
        Tuple ``(f1, elapsed_seconds, accuracy)``.
    """
    if dataset is None:
        dataset = test_dataset

    # NOTE(review): return_all_scores appears to be deprecated in recent
    # transformers releases; it is kept so each result stays a single dict.
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, return_all_scores=False)
    # Invert the model's own id -> label table instead of parsing the last
    # character of the label string, which breaks with 10+ classes or with
    # custom label names.
    label_to_id = {label: int(idx) for idx, label in classifier.model.config.id2label.items()}

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    started = time.perf_counter()
    results = classifier(dataset['comment_text'], max_length=128, padding="max_length", truncation=True)
    elapsed = time.perf_counter() - started

    cleaned_results = [label_to_id[result["label"]] for result in results]
    labels = dataset['cyberbullying']
    f1 = f1_score(labels, cleaned_results, average=f1_average)
    acc = accuracy_score(labels, cleaned_results)
    return f1, elapsed, acc

In [15]:
model.eval()
# Choose execution providers from what this onnxruntime build actually offers.
# A CUDA-enabled PyTorch does not imply a CUDA-enabled onnxruntime, so both are
# checked, and the CPU provider is always listed as the fallback.
providers = ["CPUExecutionProvider"]
if torch.cuda.is_available() and "CUDAExecutionProvider" in ort.get_available_providers():
    providers.insert(0, "CUDAExecutionProvider")
quantized_session = ort.InferenceSession(model_output,providers=providers)
# Baseline: original PyTorch model through the transformers pipeline.
originalModelScore = timed_inference(model, tokenizer)
print(f"Original Model F1 Score: {originalModelScore[0]}, Time taken: {originalModelScore[1]} seconds")
print(f"Original Model Accuracy: {originalModelScore[2]}")
# Candidate: dynamically quantized ONNX model through onnxruntime.
quantizedModelScore = timed_onnx_inference(session=quantized_session, tokenizer=tokenizer,dataset=onnx_test_dataset)
print(f"Quantized Model F1 Score: {quantizedModelScore[0]}, Time taken: {quantizedModelScore[1]} seconds")
print(f"Quantized Model Accuracy: {quantizedModelScore[2]}")

Device set to use cpu


Original Model F1 Score: 0.896, Time taken: 48.78588676452637 seconds
Original Model Accuracy: 0.896
Quantized Model F1 Score: 0.896, Time taken: 14.322396993637085 seconds
Quantized Model Accuracy: 0.896
