# Embedding Model & System-Level Benchmarks

This notebook covers:
1. Loading the bi-encoder from MLflow  
2. PyTorch vs. ONNX vs. quantized inference  
3. FAISS end-to-end search latency  
4. Triton inference server benchmarking via `perf_analyzer`  
5. HTTP-level concurrency tests  

In [2]:
# Cell 1: Install dependencies
# !pip install -q mlflow sentence-transformers torch faiss-cpu onnx onnxruntime onnxruntime-gpu onnxruntime-tools datasets
# !pip uninstall torch torchvision torchaudio -y
# !pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128


%env MLFLOW_TRACKING_USERNAME=admin
%env MLFLOW_TRACKING_PASSWORD=password

env: MLFLOW_TRACKING_USERNAME=admin
env: MLFLOW_TRACKING_PASSWORD=password


In [3]:
# Cell 2: Imports & Settings

import os
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import faiss
import requests
import torch
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
from sentence_transformers import SentenceTransformer, models
from transformers import AutoTokenizer

# Sample data for FAISS
CORPUS = ["This is document A", "Another paper B", "Yet another doc C"]
QUERIES = ["role of mitochondria", "deep learning semantics"]

# Local paths
LOCAL_MODEL_DIR = "/home/pb/projects/course/sem2/mlops/project/mlops/models/artifacts/model/model.sentence_transformer"
ONNX_DIR = "onnx_models"
os.makedirs(ONNX_DIR, exist_ok=True)


In [4]:
# Cell 3: Utilities

def time_function(fn, *args, **kwargs):
    """Return (result, elapsed_seconds)."""
    start = time.perf_counter()
    out = fn(*args, **kwargs)
    return out, (time.perf_counter() - start)

def build_faiss_index(embs: np.ndarray):
    dim = embs.shape[1]
    index = faiss.IndexFlatIP(dim)
    faiss.normalize_L2(embs)
    index.add(embs)
    return index


In [5]:
# # Cell 4: Load and prepare the embedding model
# from mlflow.tracking import MlflowClient
# from sentence_transformers import SentenceTransformer

# MLFLOW_MODEL_NAME = "arxiv-bi-encoder-longformer"
# MLFLOW_MODEL_VERSION = "1"

# _client = MlflowClient()

# def load_embedding_model_from_registry(name: str, version: str) -> SentenceTransformer:
#     """
#     Download only the `model/` subdirectory for the given registered model version,
#     then load it with SentenceTransformer.
#     """
#     # 1) Fetch the model version metadata
#     mv = _client.get_model_version(name, version)
#     run_id = mv.run_id

#     # 2) Download only the 'model' artifact dir (not checkpoints)
#     local_model_dir = _client.download_artifacts(run_id, "model")

#     # 3) Load with SentenceTransformer
#     return SentenceTransformer(local_model_dir)

# # Usage:
# embed_model = load_embedding_model_from_registry(MLFLOW_MODEL_NAME, MLFLOW_MODEL_VERSION)
# print("Loaded embedding model from:", embed_model._first_module().save_directory)


In [6]:
# Cell 4: Load Local SentenceTransformer Model

print(f"▶ Loading SentenceTransformer modules from: {LOCAL_MODEL_DIR}")

# Transformer encoder module (offline)
transformer_module = models.Transformer(
    LOCAL_MODEL_DIR,
    max_seq_length=512
)
# Pooling head
pooling_module = models.Pooling(
    transformer_module.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

# Assemble & move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer(
    modules=[transformer_module, pooling_module],
    device=device
)

print(f"✅ Loaded model onto device: {device}")
# Sanity check
vec = embed_model.encode("Test embedding", convert_to_numpy=True)
print("Embedding vector shape:", vec.shape)


▶ Loading SentenceTransformer modules from: /home/pb/projects/course/sem2/mlops/project/mlops/models/artifacts/model/model.sentence_transformer
✅ Loaded model onto device: cuda
Embedding vector shape: (768,)


In [7]:
# Cell 5: Baseline PyTorch inference (single & batch)
# Single sample
_, dt = time_function(embed_model.encode, QUERIES[0], convert_to_numpy=True)
print(f"PyTorch single-sample latency: {dt*1000:.2f} ms")

# Batches
for B in [1,8,16,32]:
    batch = [QUERIES[0]]*B
    _, dt = time_function(embed_model.encode, batch, convert_to_numpy=True)
    print(f"PyTorch batch={B} throughput: {B/dt:.1f} QPS, latency p50≈{(dt/B)*1000:.2f} ms")


PyTorch single-sample latency: 11.77 ms
PyTorch batch=1 throughput: 162.7 QPS, latency p50≈6.15 ms
PyTorch batch=8 throughput: 255.0 QPS, latency p50≈3.92 ms
PyTorch batch=16 throughput: 1365.3 QPS, latency p50≈0.73 ms
PyTorch batch=32 throughput: 2209.2 QPS, latency p50≈0.45 ms


In [8]:
# Cell 6: ONNX Export & Unoptimized ONNXRuntime Inference

# 1) Export the encoder to ONNX
encoder = embed_model._first_module().auto_model.eval().cpu()
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIR, local_files_only=True)

# Prepare dummy input
sample = tokenizer(
    "test", return_tensors="pt",
    max_length=300, padding="max_length", truncation=True
)
torch_inputs = (sample["input_ids"], sample["attention_mask"])

onnx_path = os.path.join(ONNX_DIR, "embed.onnx")
import torch
torch.onnx.export(
    encoder,
    torch_inputs,
    onnx_path,
    input_names=["input_ids", "attention_mask"],
    output_names=["last_hidden_state"],
    dynamic_axes={"input_ids": {0: "batch_size"}, "attention_mask": {0: "batch_size"}},
    opset_version=17
)
print(f"✅ Exported ONNX model to: {onnx_path}")

# 2) Benchmark unoptimized ONNXRuntime
ort_sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

def ort_encode(texts):
    toks = tokenizer(
        texts, return_tensors="np",
        max_length=300, padding="max_length", truncation=True
    )
    last_hidden = ort_sess.run(
        ["last_hidden_state"],
        {"input_ids": toks["input_ids"], "attention_mask": toks["attention_mask"]}
    )[0]
    mask = np.expand_dims(toks["attention_mask"], -1)
    embeddings = (last_hidden * mask).sum(1) / np.clip(mask.sum(1), 1e-9, None)
    return embeddings

_, dt = time_function(ort_encode, [QUERIES[0]])
print(f"ONNXRuntime (CPU) latency: {dt*1000:.2f} ms")


  torch.onnx.export(


✅ Exported ONNX model to: onnx_models/embed.onnx
ONNXRuntime (CPU) latency: 269.88 ms


In [9]:
# Cell 7: Graph Optimization & Dynamic Quantization Benchmarks

import os
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType

# Define paths
ONNX_PATH       = os.path.join(ONNX_DIR, "embed.onnx")
OPT_ONNX_PATH   = os.path.join(ONNX_DIR, "embed_opt.onnx")
DYN_QUANT_PATH  = os.path.join(ONNX_DIR, "embed_dyn.onnx")

# 1) Graph‐optimized ONNX
opt_so = ort.SessionOptions()
opt_so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
# tell ORT to write out the optimized graph
opt_so.optimized_model_filepath = OPT_ONNX_PATH
_ = ort.InferenceSession(ONNX_PATH, sess_options=opt_so, providers=["CPUExecutionProvider"])
print(f"✅ Graph‐optimized ONNX written to {OPT_ONNX_PATH}")

# 2) Dynamic quantization
quantize_dynamic(
    model_input=ONNX_PATH,
    model_output=DYN_QUANT_PATH,
    weight_type=QuantType.QInt8
)
print(f"✅ Dynamic‐quant ONNX written to {DYN_QUANT_PATH}")

# 3) Benchmark helper
def bench_onnx(path, label):
    sess = ort.InferenceSession(path, providers=["CPUExecutionProvider"])
    # reuse `sample` from Cell 6: a dict with numpy input_ids & attention_mask
    inputs = {
        "input_ids":    sample["input_ids"].numpy(),
        "attention_mask": sample["attention_mask"].numpy()
    }
    _, dt = time_function(lambda: sess.run(["last_hidden_state"], inputs))
    size_mb = os.path.getsize(path) / 1e6
    print(f"{label:8s} | latency {dt*1000:7.2f} ms | size {size_mb:5.1f} MB")

# 4) Run benchmarks
bench_onnx(ONNX_PATH,      "ONNX")
bench_onnx(OPT_ONNX_PATH,  "GraphOpt")
bench_onnx(DYN_QUANT_PATH, "DynQuant")


✅ Graph‐optimized ONNX written to onnx_models/embed_opt.onnx




✅ Dynamic‐quant ONNX written to onnx_models/embed_dyn.onnx
ONNX     | latency  160.80 ms | size 265.5 MB
GraphOpt | latency  261.08 ms | size 265.5 MB
DynQuant | latency  101.53 ms | size  66.7 MB


In [10]:
# Cell 8: ExecutionProvider Comparison on Dynamic‐Quantized ONNX Model

# List of providers to test
PROVIDERS = [
    "CPUExecutionProvider",
    "CUDAExecutionProvider",
    # "TensorrtExecutionProvider",
    # "OpenVINOExecutionProvider"
]

# Path to the dynamic-quantized ONNX model from Cell 7
DYN_QUANT_PATH = os.path.join(ONNX_DIR, "embed_dyn.onnx")

# Benchmark on each available provider
for prov in PROVIDERS:
    try:
        sess = ort.InferenceSession(DYN_QUANT_PATH, providers=[prov])
    except Exception as e:
        print(f"{prov:25s} not available: {e}")
        continue

    # Reuse the same sample inputs from Cell 6
    inputs = {
        "input_ids":    sample["input_ids"].numpy(),
        "attention_mask": sample["attention_mask"].numpy()
    }

    _, dt = time_function(lambda: sess.run(["last_hidden_state"], inputs))
    print(f"{prov:25s} | latency {dt*1000:7.2f} ms")


CPUExecutionProvider      | latency   59.99 ms


[0;93m2025-05-11 19:21:21.002371316 [W:onnxruntime:, transformer_memcpy.cc:83 ApplyImpl] 90 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-05-11 19:21:21.004418169 [W:onnxruntime:, session_state.cc:1263 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-05-11 19:21:21.004428495 [W:onnxruntime:, session_state.cc:1265 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


CUDAExecutionProvider     | latency  409.42 ms


In [11]:
# Cell 9: End-to-end FAISS search latency
# build index
embs = embed_model.encode(CORPUS, convert_to_numpy=True)
index = build_faiss_index(embs)
# query
qe = embed_model.encode([QUERIES[0]], convert_to_numpy=True)
start = time.perf_counter()
D,I = index.search(qe, k=3)
print("FAISS search + encode latency: ", (time.perf_counter()-start)*1000, "ms")


FAISS search + encode latency:  2.4162280024029315 ms


In [None]:
# Benchmarking Cells

The following cells are dedicated to benchmarking the performance of the endpoint its performance.

In [16]:
!pip install -q aiohttp nest_asyncio tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [21]:
import asyncio, json, statistics, time
from pathlib import Path

import aiohttp, nest_asyncio, numpy as np
from tqdm.notebook import tqdm  # nice progress bars in Jupyter

nest_asyncio.apply()            # allows nested event‑loops in notebooks


In [22]:
import httpx

async def bench_endpoint(url: str,
                         requests: list[dict],
                         concurrency: int = 1,
                         desc: str = "") -> dict:
    q = asyncio.Queue()
    for r in requests:            # push all payloads
        q.put_nowait(r)

    latencies = []

    async def _worker(queue, endpoint, times):
        while not queue.empty():
            payload = await queue.get()
            t0 = time.perf_counter()
            async with httpx.AsyncClient() as client:
                await client.post(endpoint, json=payload, timeout=60)
            times.append(time.perf_counter() - t0)

    # launch workers
    tasks = [asyncio.create_task(_worker(q, url, latencies))
             for _ in range(concurrency)]

    start = time.perf_counter()
    await asyncio.gather(*tasks)         # <‑‑ await only the gather
    total = time.perf_counter() - start

    return {
        "concurrency": concurrency,
        "requests"   : len(requests),
        "total_s"    : total,
        "rps"        : len(requests)/total,
        "p50_ms"     : 1e3*np.percentile(latencies, 50),
        "p95_ms"     : 1e3*np.percentile(latencies, 95),
        "p99_ms"     : 1e3*np.percentile(latencies, 99),
    }


In [23]:
# ── Cell 3 – prepare test inputs  (no JSONL file needed) ────────────────
import random

# Three representative research‑style queries
sample_queries = [
    "Graph neural networks for molecular property prediction",
    "Uncertainty estimation methods in Bayesian deep learning",
    "Efficient transformer architectures for long sequence modeling",
]

# Build FastAPI payloads → one request per query
payloads = [{"texts": [q]} for q in sample_queries]

# Optional: simulate perf_analyzer’s `-b` (batch‑size) flag
BATCH = 1                 # set >1 if you want to duplicate texts per request
if BATCH > 1:
    payloads = [{"texts": p["texts"] * BATCH} for p in payloads]

# (Optional) shuffle to avoid ordering bias in repeated runs
random.shuffle(payloads)

print(payloads)


[{'texts': ['Uncertainty estimation methods in Bayesian deep learning']}, {'texts': ['Efficient transformer architectures for long sequence modeling']}, {'texts': ['Graph neural networks for molecular property prediction']}]


In [24]:
url = "http://localhost:8000/embed"   # FastAPI endpoint
results = []

for c in range(1, 17):                # 1 .. 16
    res = await bench_endpoint(url, payloads, concurrency=c,
                               desc=f"conc={c}")
    results.append(res)

import pandas as pd, rich
df = pd.DataFrame(results)
rich.print(df)        # pretty table
