In [None]:
import torch
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import Dataset
import numpy as np
from sklearn.metrics import f1_score
import os
import time
import torch
from scipy.special import expit
torch.cuda.is_available()

False

In [3]:
# Column name -> position of that class in the multi-label vector.
label_map = {
    name: position
    for position, name in enumerate(
        ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
    )
}

In [50]:
# Define label map up front so this cell runs on a fresh kernel without relying
# on a definition made in another cell.
label_map = {'toxic': 0, 'severe_toxic': 1, 'obscene': 2, 'threat': 3, 'insult': 4, 'identity_hate': 5}

# Test split stored as a single Arrow shard.
test_dataset = Dataset.from_file(r"processed_dataset/test/data-00000-of-00001.arrow")

# Fine-tuned classifier and its tokenizer are loaded from the same local directory;
# the classification head gets one output per entry in label_map.
model = AutoModelForSequenceClassification.from_pretrained(
    "./models/BERT_Multi-Label_classification",
    num_labels=len(label_map),
    hidden_dropout_prob=0.1,
)
tokenizer = AutoTokenizer.from_pretrained("./models/BERT_Multi-Label_classification")

# Create multi-label column
def create_multi_label(example):
    """Collapse the per-class columns of one record into a single "labels" vector.

    Reads ``example[column]`` for every key of the module-level ``label_map``
    (in its iteration order), converts each value to ``np.float32`` and returns
    them as a list under the key ``"labels"``.
    """
    vector = []
    for column in label_map:
        vector.append(np.float32(example[column]))
    return {"labels": vector}

# Add the combined "labels" column, then drop the individual per-class columns
# it was built from.
test_dataset = test_dataset.map(create_multi_label)
test_dataset = test_dataset.remove_columns([*label_map])

# Preview the first record.
test_dataset[0]

{'id': '0001ea8717f6de06',
 'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.',
 'cyberbullying': 0,
 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [36]:
def preprocess_for_onnx(ex):
    """Tokenize one record's ``comment_text`` for ONNX inference.

    The text is padded/truncated to 128 tokens with the module-level
    ``tokenizer`` and returned as NumPy arrays. The tokenizer output carries a
    leading axis, so its first row is taken for each field. The record's
    existing ``labels`` value is passed through unchanged.

    Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    encoded = tokenizer(
        ex["comment_text"],
        return_tensors="np",
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    features = {field: encoded[field][0] for field in ("input_ids", "attention_mask")}
    features["labels"] = ex["labels"]
    return features
# Run the tokenization over every record of the test split.
onnx_test_dataset = test_dataset.map(function=preprocess_for_onnx)

# Preview the first tokenized record.
onnx_test_dataset[0]

Map: 100%|██████████| 63978/63978 [00:37<00:00, 1715.18 examples/s]


{'id': '0001ea8717f6de06',
 'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.',
 'cyberbullying': 0,
 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'input_ids': [101,
  4067,
  2017,
  2005,
  4824,
  1012,
  1045,
  2228,
  2200,
  3811,
  1997,
  2017,
  1998,
  2052,
  2025,
  7065,
  8743,
  2302,
  6594,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [53]:
def print_size_of_model(model, name):
    """Print the serialized size of a PyTorch model's ``state_dict`` in MB.

    The state dict is saved into a private temporary directory, measured, and
    the directory is removed again even if saving or measuring fails. Nothing
    is written to the current working directory, so an existing ``temp.p``
    there is never overwritten or deleted.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        name: label printed in front of the size.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        size_mb = os.path.getsize(checkpoint_path) / 1e6  # 1 MB = 1e6 bytes

    print(name, ' Size (MB):', size_mb)
def print_size_of_onnx_model(model_path, name="ONNX Model"):
    """Print the on-disk size of the file at ``model_path`` in MB (1 MB = 1e6 bytes).

    Args:
        model_path: path of the file to measure.
        name: label printed in front of the size.
    """
    byte_count = os.stat(model_path).st_size
    megabytes = byte_count / 1e6
    print(f"{name} Size (MB): {megabytes:.2f}")

In [54]:
# The file measured here is the quantized export, so label it as such.
print_size_of_onnx_model("models/multi_label_model_quant.onnx", "Quantized ONNX model")
# Serialized size of the PyTorch model loaded above, for comparison.
print_size_of_model(model, "Original model")

Original ONNX model Size (MB): 110.38
Original model  Size (MB): 438.012793


## Code for exporting the BERT model to an ONNX model

In [15]:
# A single padded example serves as the sample input for the export.
dummy = tokenizer(
    "This is a test",
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Export with named inputs/outputs; both inputs share the same named axes.
torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"]),
    "models/multi_label_model.onnx",
    opset_version=14,
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        **{
            input_name: {0: "batch_size", 1: "seq_len"}
            for input_name in ("input_ids", "attention_mask")
        },
        "logits": {0: "batch_size", 1: "num_labels"},
    },
)

In [None]:
# Reload the exported graph from disk and run the ONNX checker on it.
onnx_model = onnx.load("models/multi_label_model.onnx")
onnx.checker.check_model(onnx_model)
# Reached only when the two calls above did not raise.
print("ONNX export successful and model is valid!")

ONNX export successful and model is valid!


### ONNX Quantization code 

In [33]:
# Source (float) export and destination for the dynamically quantized copy.
model_input, model_output = (
    "models/multi_label_model.onnx",
    "models/multi_label_model_quant.onnx",
)

# Dynamic quantization with unsigned 8-bit weights.
quantize_dynamic(model_input, model_output, weight_type=QuantType.QUInt8)



## Inference Section

In [47]:
def timed_onnx_inference(session, tokenizer, dataset, batch_size=32, max_length=128, threshold=0.5):
    """Run batched ONNX inference over ``dataset`` and report micro-F1 and elapsed time.

    For each batch the ``comment_text`` column is tokenized (padded/truncated to
    ``max_length``), fed to ``session`` as int64 ``input_ids`` and
    ``attention_mask``, and the ``logits`` output is passed through a sigmoid
    and thresholded to obtain per-label 0/1 predictions.

    Args:
        session: object with ``run(output_names, input_feed)`` returning a list
            whose first element is the logits array.
        tokenizer: callable returning ``input_ids`` and ``attention_mask`` as
            NumPy arrays when called with ``return_tensors="np"``.
        dataset: sliceable collection where ``dataset[i:j]`` yields a dict of
            columns containing ``comment_text`` and ``labels``.
        batch_size: number of records per inference call; must be positive.
        max_length: token length every example is padded/truncated to.
        threshold: probability above which a label is predicted as present.

    Returns:
        ``(f1, seconds)``: micro-averaged F1 over all labels and the time spent
        in the tokenize + inference loop (the F1 computation is not timed).

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    all_preds = []
    all_labels = []

    # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments.
    start_time = time.perf_counter()

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i + batch_size]  # dict of batched columns

        tokens = tokenizer(
            batch["comment_text"],
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

        inputs = {
            "input_ids": tokens["input_ids"].astype(np.int64),
            "attention_mask": tokens["attention_mask"].astype(np.int64),
        }

        logits = session.run(["logits"], inputs)[0]
        # expit is a numerically stable sigmoid (no overflow for large-magnitude logits).
        probs = expit(logits)
        preds = (probs > threshold).astype(int)

        all_preds.extend(preds.tolist())
        all_labels.extend(batch["labels"])

    elapsed = time.perf_counter() - start_time

    f1 = f1_score(all_labels, all_preds, average="micro")
    return f1, elapsed

In [41]:
def timed_inference(model, tokenizer):
    """Score the PyTorch model on the module-level ``test_dataset`` and time it.

    A ``text-classification`` pipeline is built on the first CUDA device when
    one is available and on the CPU otherwise. Pipeline construction is kept
    outside the timed region so that only tokenization + inference is measured,
    mirroring ``timed_onnx_inference``, which receives a ready-made session.

    Args:
        model: sequence-classification model handed to the pipeline.
        tokenizer: tokenizer handed to the pipeline.

    Returns:
        ``(f1, seconds)``: micro-averaged F1 of the thresholded predictions
        against ``test_dataset['labels']`` and the elapsed inference time.
    """
    # device=-1 selects the CPU; hard-coding 0 fails on machines without CUDA.
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
        # Kept on purpose: scores are consumed positionally below.
        # NOTE(review): the newer `top_k=None` spelling appears to return entries
        # sorted by score, which would break label alignment - verify before switching.
        return_all_scores=True,
        # Multi-label head: force an independent sigmoid per label instead of
        # relying on the model config to select the activation.
        function_to_apply="sigmoid",
    )

    start_time = time.perf_counter()
    results = classifier(test_dataset['comment_text'], max_length=128, padding="max_length", truncation=True)
    elapsed = time.perf_counter() - start_time

    # One 0/1 decision per label, in the order the pipeline reports the labels.
    cleaned_results = [
        [1 if score["score"] > 0.5 else 0 for score in result]
        for result in results
    ]
    f1 = f1_score(test_dataset['labels'], cleaned_results, average='micro')
    return f1, elapsed

In [48]:
# Request CUDA only when this onnxruntime build actually offers it, and always
# keep the CPU provider as a fallback so the session can be created on any machine.
available_providers = ort.get_available_providers()
providers = [
    provider
    for provider in ("CUDAExecutionProvider", "CPUExecutionProvider")
    if provider in available_providers
]
quantized_session = ort.InferenceSession("models/multi_label_model_quant.onnx", providers=providers)
#originalModelScore = timed_inference(model, tokenizer)
quantizedModelScore = timed_onnx_inference(session=quantized_session, tokenizer=tokenizer,dataset=onnx_test_dataset)
#print(f"Original Model F1 Score: {originalModelScore[0]}, Time taken: {originalModelScore[1]} seconds")
print(f"Quantized Model F1 Score: {quantizedModelScore[0]}, Time taken: {quantizedModelScore[1]} seconds")

KeyboardInterrupt: 