In [None]:
!pip install -U transformers
!pip install -U torch
!pip install -U joblib
!pip install -U onnx
!pip install -U onnxruntime
!pip install -U scikit-learn
!pip install -U datasets

In [1]:
from transformers import DistilBertTokenizer
import torch
import joblib
from torch.utils.data import DataLoader, Dataset
import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import os
import time
from datasets import Dataset
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [2]:
# Load the classifier that was serialized with joblib and show its architecture.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so only
# load artifacts that you created yourself or otherwise trust.
model = joblib.load("binary_classification_kd.joblib")
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [3]:
# Names of the fine-grained label columns stored in the Arrow file. Only the keys
# (column names) are read in this cell; the integer values are not used here.
label_map = {
    'toxic': 0,
    'severe_toxic': 1,
    'obscene': 2,
    'threat': 3,
    'insult': 4,
    'identity_hate': 5,
}

# Load the test split from its Arrow file and drop the unnecessary fine-grained
# label columns; later cells read `comment_text` and `cyberbullying` from it.
test_dataset = Dataset.from_file(
    r"../subset_test/data-00000-of-00001.arrow"
).remove_columns(list(label_map))
print(test_dataset[0])

# Tokenizer used to encode `comment_text` for both the PyTorch and ONNX models.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

{'id': 'fd74868bc7a076e1', 'comment_text': "What the hell, what would we call her if we wanted to say she was black? I'm talking about her race.", 'cyberbullying': 1}


In [4]:
def preprocess_for_onnx(ex):
    """Tokenize one example's comment and pair it with its label.

    Args:
        ex: Mapping with a ``"comment_text"`` entry (the text to tokenize) and a
            ``"cyberbullying"`` entry (passed through unchanged as the label).

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` (row 0 of the
        tokenizer's NumPy output, padded/truncated to 128 tokens) and
        ``"labels"``.
    """
    encoded = tokenizer(
        ex["comment_text"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="np",
    )
    # The tokenizer output is indexed with [0] to take the row for this one example.
    features = {key: encoded[key][0] for key in ("input_ids", "attention_mask")}
    features["labels"] = ex["cyberbullying"]
    return features
# Build a tokenized copy of the test split by running preprocess_for_onnx over it.
onnx_test_dataset = test_dataset.map(function=preprocess_for_onnx)

## Code for exporting the DistilBERT model to an ONNX model

In [36]:
# Export in evaluation mode so layers such as dropout behave as they do at inference time.
model.eval()

# Example input for the export, padded/truncated to 128 tokens like the evaluation inputs.
dummy = tokenizer("This is a test", return_tensors="pt", padding="max_length", truncation=True, max_length=128)
# Move the example *inputs* onto the device that already holds the model weights.
# The model itself is not moved here, so sending the inputs to the global `device`
# would fail whenever the loaded model lives on a different device.
dummy = dummy.to(next(model.parameters()).device)
torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"]),
    "binary_distill_model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "seq_len"},
        "attention_mask": {0: "batch_size", 1: "seq_len"},
        # Only the batch axis of the output varies; axis 1 is the class dimension,
        # whose size is fixed by the classifier head.
        "logits": {0: "batch_size"},
    },
    opset_version=14
)

In [37]:
# Reload the exported graph from disk and run ONNX's checker on it. The success
# message is only printed if check_model returns without raising.
onnx_model = onnx.load(
    "binary_distill_model.onnx"
)
onnx.checker.check_model(onnx_model)
print("ONNX export successful and model is valid!")

ONNX export successful and model is valid!


## ONNX quantization code

In [38]:
# Paths of the float ONNX model (input) and the quantized model to be written (output).
model_input = "binary_distill_model.onnx"
model_output = "binary_distill_model_quant.onnx"

# Dynamic quantization with unsigned 8-bit weights.
quantize_dynamic(model_input, model_output, weight_type=QuantType.QUInt8)



In [39]:
def print_size_of_model(model,name):
    """Print the serialized size of ``model.state_dict()`` in megabytes (10**6 bytes).

    The state dict is saved with ``torch.save`` into a private temporary directory,
    measured, and removed again. Nothing is written to the current working
    directory, so an existing ``temp.p`` there is neither overwritten nor deleted,
    and the scratch file is cleaned up even if measuring or printing fails.

    Args:
        model: Object exposing ``state_dict()`` whose result ``torch.save`` can serialize.
        name: Label printed in front of the size.

    Returns:
        None. The size is only printed.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        size_bytes = os.path.getsize(checkpoint_path)
    print(name,' Size (MB):', size_bytes/1e6)
def print_size_of_onnx_model(model_path, name="ONNX Model"):
    """Print the on-disk size of the file at ``model_path`` in megabytes (10**6 bytes).

    Args:
        model_path: Path of the file to measure.
        name: Label printed in front of the size.
    """
    bytes_on_disk = os.stat(model_path).st_size
    size_mb = bytes_on_disk / 1e6
    print(f"{name} Size (MB): {size_mb:.2f}")

In [41]:
# Size of the quantized ONNX file on disk, then size of the PyTorch model's
# serialized state dict, for comparison.
print_size_of_onnx_model(
    "binary_distill_model_quant.onnx",
    name="Quant ONNX model",
)
print_size_of_model(model, name="Original model")

Quant ONNX model Size (MB): 67.38
Original model  Size (MB): 267.854622


# Inference Section

In [5]:
def timed_onnx_inference(session, tokenizer, dataset, batch_size=32, max_length=128):
    """Run batched inference through an ONNX Runtime session and time it.

    Each batch's ``"comment_text"`` is tokenized (padded/truncated to
    ``max_length``), fed to the session as int64 ``input_ids`` and
    ``attention_mask``, and the predicted class is the argmax over the returned
    ``"logits"``. Predictions are compared with the ``"cyberbullying"`` column.

    Args:
        session: Object whose ``run(output_names, inputs)`` returns a list with
            the logits array first (one row per example, one column per class).
        tokenizer: Callable accepting a list of texts plus ``return_tensors``,
            ``padding``, ``truncation`` and ``max_length`` keyword arguments and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        dataset: Sized object whose slices are dicts of columns containing
            ``"comment_text"`` and ``"cyberbullying"``.
        batch_size: Number of examples per session call.
        max_length: Token length every input is padded/truncated to.

    Returns:
        Tuple ``(f1, seconds)``: micro-averaged F1 and the elapsed wall-clock
        time, which covers tokenization as well as the session calls.
    """
    all_preds = []
    all_labels = []

    start_time = time.perf_counter()
    for offset in range(0, len(dataset), batch_size):
        batch = dataset[offset:offset + batch_size]  # dict of batched columns

        tokens = tokenizer(
            batch["comment_text"],
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

        inputs = {
            "input_ids": tokens["input_ids"].astype(np.int64),
            "attention_mask": tokens["attention_mask"].astype(np.int64)
        }

        logits = np.asarray(session.run(["logits"], inputs)[0])
        # Pick the highest-scoring class per row. Thresholding per-class sigmoids at
        # 0.5 and taking the first hit fails when no class exceeds the threshold and
        # picks the wrong class when several do.
        all_preds.extend(np.argmax(logits, axis=-1).tolist())
        all_labels.extend(batch["cyberbullying"])
    elapsed = time.perf_counter() - start_time

    f1 = f1_score(all_labels, all_preds, average="micro")
    return f1, elapsed

def timed_inference(model, tokenizer, dataset=None):
    """Classify the test comments with a text-classification pipeline and time it.

    The pipeline is built on the global ``device`` *before* the timer starts, so
    the reported time covers only the classification call. This matches
    ``timed_onnx_inference``, which receives an already-built session.

    Args:
        model: Model handed to ``pipeline("text-classification", ...)``.
        tokenizer: Tokenizer handed to the same pipeline.
        dataset: Object whose ``"comment_text"`` and ``"cyberbullying"`` columns
            are the inputs and labels. Defaults to the global ``test_dataset``.

    Returns:
        Tuple ``(f1, seconds)``: micro-averaged F1 and the elapsed wall-clock
        time of the pipeline call.
    """
    if dataset is None:
        dataset = test_dataset

    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, return_all_scores=False)
    texts = list(dataset["comment_text"])
    labels = list(dataset["cyberbullying"])

    start_time = time.perf_counter()
    results = classifier(texts, max_length=128, padding="max_length", truncation=True)
    elapsed = time.perf_counter() - start_time

    # Map label strings back to class ids through the model config rather than
    # parsing the last character of the label, which breaks for custom label
    # names or for ten or more classes.
    label2id = classifier.model.config.label2id
    predictions = [label2id[result["label"]] for result in results]
    f1 = f1_score(labels, predictions, average='micro')
    return f1, elapsed

In [6]:
# Show which ONNX Runtime execution providers this installation offers; the
# benchmark cell below creates its session with "CPUExecutionProvider".
ort.get_available_providers()

['AzureExecutionProvider', 'CPUExecutionProvider']

In [11]:
# Put the PyTorch model in evaluation mode before benchmarking it.
model.eval()

# The quantized ONNX session is pinned to the CPU provider, while timed_inference
# runs the PyTorch pipeline on `device` (CUDA when available), so the two
# timings printed below are not a like-for-like hardware comparison.
providers = ["CPUExecutionProvider"]
quantized_session = ort.InferenceSession(
    "binary_distill_model_quant.onnx",
    providers=providers,
)

# Each score is an (f1, seconds) tuple.
originalModelScore = timed_inference(model, tokenizer)
print(f"Original Model F1 Score: {originalModelScore[0]}, Time taken: {originalModelScore[1]} seconds")

quantizedModelScore = timed_onnx_inference(
    session=quantized_session,
    tokenizer=tokenizer,
    dataset=onnx_test_dataset,
)
print(f"Quantized Model F1 Score: {quantizedModelScore[0]}, Time taken: {quantizedModelScore[1]} seconds")

Device set to use cuda


Original Model F1 Score: 0.902, Time taken: 2.9888014793395996 seconds
Quantized Model F1 Score: 0.902, Time taken: 51.64895009994507 seconds
