In [None]:
!pip install olive-ai[auto-opt]
!pip install transformers onnxruntime-genai

#### Imports

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from transformers import AutoTokenizer
import onnxruntime as ort
import random
import time

#### Quantization

In [None]:
# Directory holding the fine-tuned model, and the directory that receives the quantized model.
model_path = "models/bert_fine_tuned_model"
quant_path = "models/bert_quantized"

# Make sure the output directory exists; exist_ok=True keeps the cell re-runnable.
os.makedirs(name=quant_path, exist_ok=True)

In [None]:
# Run the Olive CLI `optimize` command on the model at model_path with int8 precision,
# writing the result under quant_path. $model_path and $quant_path are expanded by
# IPython from the Python variables of the same name before the shell sees the command.
!olive optimize \
    --model_name_or_path $model_path \
    --precision int8 \
    --output_path $quant_path

#### Tokenizer and inference session

In [None]:
# The tokenizer is loaded from the original model directory; the ONNX Runtime
# session is created from the model.onnx file inside the quantized output directory.
onnx_model_file = f"{quant_path}/model.onnx"
tokenizer = AutoTokenizer.from_pretrained(model_path)
session = ort.InferenceSession(onnx_model_file)

In [None]:
def predict_batch(texts):
    """Predict a class index for each text using the ONNX Runtime session.

    Args:
        texts: A single string or a sequence of strings to classify.

    Returns:
        A 1-D NumPy array with one argmax class index per input text
        (empty when ``texts`` is empty).
    """
    # Normalise to a plain list so any sequence type is handled the same way;
    # a lone string is treated as a batch of one.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Avoid calling the tokenizer and the session with an empty batch.
        return np.empty(0, dtype=np.intp)

    inputs = tokenizer(texts, return_tensors="np", padding=True, truncation=True, max_length=128)
    # Feed only the tensors the ONNX graph declares as inputs: the tokenizer may
    # emit extra keys (e.g. token_type_ids) and session.run rejects unknown names.
    expected_names = {node.name for node in session.get_inputs()}
    ort_inputs = {k: v for k, v in inputs.items() if k in expected_names}
    # Assumes the first model output holds the logits as (batch, num_classes) — TODO confirm.
    logits = session.run(None, ort_inputs)[0]
    return np.argmax(logits, axis=1)

# NOTE(review): `token_data` is not defined in any cell visible in this notebook, so
# these lines would raise NameError on a fresh kernel — TODO load/define it above.
# Presumably it is a dataset dict whose "test" split has "text" and "label" columns; verify.
texts = token_data["test"]["text"]
labels = np.array(token_data["test"]["label"])


In [None]:
# Run inference over the test texts in fixed-size chunks and flatten the
# per-chunk predictions into a single list.
batch_size = 8
preds = [
    label_id
    for offset in range(0, len(texts), batch_size)
    for label_id in predict_batch(texts[offset:offset + batch_size])
]

# Predictions and ground-truth labels under the names used by the metric cells.
y_pred = np.array(preds)
y_true = labels

#### Metrics

In [None]:
# Headline scores for the quantized model.
acc = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average='macro')
print(f"\nQuantized Model Accuracy: {acc:.4f}")
print(f"Quantized Model Macro-F1: {macro_f1:.4f}")

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=3))

# Confusion matrix drawn as an annotated heatmap on an explicit figure/axes pair.
cm = confusion_matrix(y_true, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix (Quantized Model)")
plt.show()

In [None]:
def get_size(path):
    """Return the on-disk size of ``path`` in mebibytes (bytes / 1024**2).

    Args:
        path: A directory (walked recursively) or a single file.

    Returns:
        The total size as a float; 0.0 when ``path`` does not exist or is
        an empty directory.
    """
    # os.walk yields nothing for a plain file, which would report 0 MB for a
    # single-file model, so handle that case explicitly.
    if os.path.isfile(path):
        return os.path.getsize(path) / (1024 * 1024)

    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(path)
        for filename in filenames
    )
    return total_bytes / (1024 * 1024)

# Compare the on-disk footprint of the original and quantized model directories.
# NOTE(review): get_size sums every file under each directory, so anything else
# stored alongside the weights is included in both numbers.
size_fp32 = get_size(model_path)
size_int8 = get_size(quant_path)

# Guard the ratio: get_size returns 0 for a missing or empty directory, which
# would otherwise raise ZeroDivisionError here.
reduction_pct = 100 * (1 - size_int8 / size_fp32) if size_fp32 else float("nan")

print(f"\nModel size — FP32: {size_fp32:.2f} MB | INT8: {size_int8:.2f} MB | Reduction: {reduction_pct:.1f}%")


In [None]:
# Latency benchmark: time repeated single-example inference on randomly chosen test texts.
# NOTE(review): relies on `token_data`, which is not defined in the visible notebook — verify.
LATENCY_SAMPLES = 20  # number of test examples to time
LATENCY_REPEATS = 10  # timed runs averaged per example
latency_rng = random.Random(0)  # fixed seed so the same examples are picked on every run

# Only pass tensors the ONNX graph declares; session.run rejects unknown input names.
session_input_names = {node.name for node in session.get_inputs()}

latencies = []
for sample_no in range(LATENCY_SAMPLES):
    sample_index = latency_rng.randint(0, len(token_data["test"]) - 1)
    text = token_data["test"][sample_index]["text"]
    inputs = tokenizer(text, return_tensors="np", truncation=True, padding=True, max_length=128)
    ort_inputs = {k: v for k, v in inputs.items() if k in session_input_names}
    session.run(None, ort_inputs)  # warm-up run, excluded from the timing
    # perf_counter is monotonic and high-resolution, which time.time() does not guarantee.
    start = time.perf_counter()
    for repeat_no in range(LATENCY_REPEATS):
        session.run(None, ort_inputs)
    latencies.append((time.perf_counter() - start) / LATENCY_REPEATS)

print(f"Mean latency (INT8): {np.mean(latencies)*1000:.2f} ms (n={LATENCY_SAMPLES})")
